diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 31cd43dd5943ab..12b918faa46aa4 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -662,6 +662,7 @@ Python Binding Changes OpenMP Support -------------- - Added support for 'omp assume' directive. +- Added support for 'omp scope' directive. Improvements ^^^^^^^^^^^^ diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h index 2f6cd481fd6362..eef7a54f03bf11 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h @@ -326,14 +326,14 @@ class ProgramState : public llvm::FoldingSetNode { /// \param ITraits information about special handling for particular regions /// or symbols. [[nodiscard]] ProgramStateRef - invalidateRegions(ArrayRef Regions, const Expr *E, + invalidateRegions(ArrayRef Regions, const Stmt *S, unsigned BlockCount, const LocationContext *LCtx, bool CausesPointerEscape, InvalidatedSymbols *IS = nullptr, const CallEvent *Call = nullptr, RegionAndSymbolInvalidationTraits *ITraits = nullptr) const; [[nodiscard]] ProgramStateRef - invalidateRegions(ArrayRef Values, const Expr *E, unsigned BlockCount, + invalidateRegions(ArrayRef Values, const Stmt *S, unsigned BlockCount, const LocationContext *LCtx, bool CausesPointerEscape, InvalidatedSymbols *IS = nullptr, const CallEvent *Call = nullptr, diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SValBuilder.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SValBuilder.h index 6eedaf0544559b..ec2b2b24569480 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SValBuilder.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SValBuilder.h @@ -202,11 +202,9 @@ class SValBuilder { const Expr *expr, const LocationContext *LCtx, unsigned count); - DefinedOrUnknownSVal conjureSymbolVal(const void *symbolTag, - const Expr *expr, + DefinedOrUnknownSVal conjureSymbolVal(const void *symbolTag, const Stmt *S, const LocationContext *LCtx, - QualType type, - unsigned count); + QualType type, unsigned count); DefinedOrUnknownSVal conjureSymbolVal(const Stmt *stmt, const LocationContext *LCtx, QualType type, diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/Store.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/Store.h index e08d5e104e9c0a..332855a3c9c45e 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/Store.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/Store.h @@ -215,7 +215,7 @@ class StoreManager { /// /// \param[in] store The initial store. /// \param[in] Values The values to invalidate. - /// \param[in] E The current statement being evaluated. Used to conjure + /// \param[in] S The current statement being evaluated. Used to conjure /// symbols to mark the values of invalidated regions. /// \param[in] Count The current block count. Used to conjure /// symbols to mark the values of invalidated regions. @@ -233,7 +233,7 @@ class StoreManager { /// even if they do not currently have bindings. Pass \c NULL if this /// information will not be used. 
virtual StoreRef invalidateRegions( - Store store, ArrayRef Values, const Expr *Ex, unsigned Count, + Store store, ArrayRef Values, const Stmt *S, unsigned Count, const LocationContext *LCtx, const CallEvent *Call, InvalidatedSymbols &IS, RegionAndSymbolInvalidationTraits &ITraits, InvalidatedRegions *TopLevelRegions, InvalidatedRegions *Invalidated) = 0; diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 754cd0db9868b7..785918846976d4 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -1282,9 +1282,8 @@ bool Compiler::VisitVectorBinOp(const BinaryOperator *E) { ? BinaryOperator::getOpForCompoundAssignment(E->getOpcode()) : E->getOpcode(); - // The LHS and RHS of a comparison operator must have the same type. So we - // just use LHS vector element type here. PrimType ElemT = this->classifyVectorElementType(LHS->getType()); + PrimType RHSElemT = this->classifyVectorElementType(RHS->getType()); PrimType ResultElemT = this->classifyVectorElementType(E->getType()); // Evaluate LHS and save value to LHSOffset. @@ -1312,7 +1311,7 @@ bool Compiler::VisitVectorBinOp(const BinaryOperator *E) { PrimType PromotT = classifyPrim(PromotTy); PrimType OpT = NeedIntPromot ? PromotT : ElemT; - auto getElem = [=](unsigned Offset, unsigned Index) { + auto getElem = [=](unsigned Offset, PrimType ElemT, unsigned Index) { if (!this->emitGetLocal(PT_Ptr, Offset, E)) return false; if (!this->emitArrayElemPop(ElemT, Index, E)) @@ -1342,9 +1341,9 @@ bool Compiler::VisitVectorBinOp(const BinaryOperator *E) { } for (unsigned I = 0; I != VecTy->getNumElements(); ++I) { - if (!getElem(LHSOffset, I)) + if (!getElem(LHSOffset, ElemT, I)) return false; - if (!getElem(RHSOffset, I)) + if (!getElem(RHSOffset, RHSElemT, I)) return false; switch (Op) { case BO_Add: @@ -1372,11 +1371,11 @@ bool Compiler::VisitVectorBinOp(const BinaryOperator *E) { return false; break; case BO_Shl: - if (!this->emitShl(OpT, ElemT, E)) + if (!this->emitShl(OpT, RHSElemT, E)) return false; break; case BO_Shr: - if (!this->emitShr(OpT, ElemT, E)) + if (!this->emitShr(OpT, RHSElemT, E)) return false; break; case BO_EQ: diff --git a/clang/lib/Frontend/Rewrite/RewriteObjC.cpp b/clang/lib/Frontend/Rewrite/RewriteObjC.cpp index a1e792bf772ba2..f49ccf7be68e22 100644 --- a/clang/lib/Frontend/Rewrite/RewriteObjC.cpp +++ b/clang/lib/Frontend/Rewrite/RewriteObjC.cpp @@ -128,10 +128,8 @@ namespace { SmallVector BlockDeclRefs; // Block related declarations. 
- SmallVector BlockByCopyDecls; - llvm::SmallPtrSet BlockByCopyDeclsPtrSet; - SmallVector BlockByRefDecls; - llvm::SmallPtrSet BlockByRefDeclsPtrSet; + llvm::SmallSetVector BlockByCopyDecls; + llvm::SmallSetVector BlockByRefDecls; llvm::DenseMap BlockByRefDeclNo; llvm::SmallPtrSet ImportedBlockDecls; llvm::SmallPtrSet ImportedLocalExternalDecls; @@ -3357,7 +3355,7 @@ std::string RewriteObjC::SynthesizeBlockHelperFuncs(BlockExpr *CE, int i, S += VD->getNameAsString(); S += ", (void*)src->"; S += VD->getNameAsString(); - if (BlockByRefDeclsPtrSet.count(VD)) + if (BlockByRefDecls.contains(VD)) S += ", " + utostr(BLOCK_FIELD_IS_BYREF) + "/*BLOCK_FIELD_IS_BYREF*/);"; else if (VD->getType()->isBlockPointerType()) S += ", " + utostr(BLOCK_FIELD_IS_BLOCK) + "/*BLOCK_FIELD_IS_BLOCK*/);"; @@ -3374,7 +3372,7 @@ std::string RewriteObjC::SynthesizeBlockHelperFuncs(BlockExpr *CE, int i, for (ValueDecl *VD : ImportedBlockDecls) { S += "_Block_object_dispose((void*)src->"; S += VD->getNameAsString(); - if (BlockByRefDeclsPtrSet.count(VD)) + if (BlockByRefDecls.contains(VD)) S += ", " + utostr(BLOCK_FIELD_IS_BYREF) + "/*BLOCK_FIELD_IS_BYREF*/);"; else if (VD->getType()->isBlockPointerType()) S += ", " + utostr(BLOCK_FIELD_IS_BLOCK) + "/*BLOCK_FIELD_IS_BLOCK*/);"; @@ -3553,14 +3551,10 @@ void RewriteObjC::SynthesizeBlockLiterals(SourceLocation FunLocStart, DeclRefExpr *Exp = InnerDeclRefs[count++]; ValueDecl *VD = Exp->getDecl(); BlockDeclRefs.push_back(Exp); - if (!VD->hasAttr() && !BlockByCopyDeclsPtrSet.count(VD)) { - BlockByCopyDeclsPtrSet.insert(VD); - BlockByCopyDecls.push_back(VD); - } - if (VD->hasAttr() && !BlockByRefDeclsPtrSet.count(VD)) { - BlockByRefDeclsPtrSet.insert(VD); - BlockByRefDecls.push_back(VD); - } + if (VD->hasAttr()) + BlockByRefDecls.insert(VD); + else + BlockByCopyDecls.insert(VD); // imported objects in the inner blocks not used in the outer // blocks must be copied/disposed in the outer block as well. if (VD->hasAttr() || @@ -3590,9 +3584,7 @@ void RewriteObjC::SynthesizeBlockLiterals(SourceLocation FunLocStart, BlockDeclRefs.clear(); BlockByRefDecls.clear(); - BlockByRefDeclsPtrSet.clear(); BlockByCopyDecls.clear(); - BlockByCopyDeclsPtrSet.clear(); ImportedBlockDecls.clear(); } if (RewriteSC) { @@ -4314,20 +4306,12 @@ void RewriteObjC::CollectBlockDeclRefInfo(BlockExpr *Exp) { if (BlockDeclRefs.size()) { // Unique all "by copy" declarations. for (unsigned i = 0; i < BlockDeclRefs.size(); i++) - if (!BlockDeclRefs[i]->getDecl()->hasAttr()) { - if (!BlockByCopyDeclsPtrSet.count(BlockDeclRefs[i]->getDecl())) { - BlockByCopyDeclsPtrSet.insert(BlockDeclRefs[i]->getDecl()); - BlockByCopyDecls.push_back(BlockDeclRefs[i]->getDecl()); - } - } + if (!BlockDeclRefs[i]->getDecl()->hasAttr()) + BlockByCopyDecls.insert(BlockDeclRefs[i]->getDecl()); // Unique all "by ref" declarations. for (unsigned i = 0; i < BlockDeclRefs.size(); i++) - if (BlockDeclRefs[i]->getDecl()->hasAttr()) { - if (!BlockByRefDeclsPtrSet.count(BlockDeclRefs[i]->getDecl())) { - BlockByRefDeclsPtrSet.insert(BlockDeclRefs[i]->getDecl()); - BlockByRefDecls.push_back(BlockDeclRefs[i]->getDecl()); - } - } + if (BlockDeclRefs[i]->getDecl()->hasAttr()) + BlockByRefDecls.insert(BlockDeclRefs[i]->getDecl()); // Find any imported blocks...they will need special attention. 
for (unsigned i = 0; i < BlockDeclRefs.size(); i++) if (BlockDeclRefs[i]->getDecl()->hasAttr() || @@ -4358,22 +4342,18 @@ Stmt *RewriteObjC::SynthBlockInitExpr(BlockExpr *Exp, for (unsigned i = 0; i < InnerBlockDeclRefs.size(); i++) { DeclRefExpr *Exp = InnerBlockDeclRefs[i]; ValueDecl *VD = Exp->getDecl(); - if (!VD->hasAttr() && - BlockByCopyDeclsPtrSet.insert(VD).second) { + if (!VD->hasAttr() && BlockByCopyDecls.insert(VD)) { // We need to save the copied-in variables in nested // blocks because it is needed at the end for some of the API // generations. See SynthesizeBlockLiterals routine. InnerDeclRefs.push_back(Exp); countOfInnerDecls++; BlockDeclRefs.push_back(Exp); - BlockByCopyDecls.push_back(VD); } - if (VD->hasAttr() && - BlockByRefDeclsPtrSet.insert(VD).second) { + if (VD->hasAttr() && BlockByRefDecls.insert(VD)) { InnerDeclRefs.push_back(Exp); countOfInnerDecls++; BlockDeclRefs.push_back(Exp); - BlockByRefDecls.push_back(VD); } } // Find any imported blocks...they will need special attention. @@ -4534,9 +4514,7 @@ Stmt *RewriteObjC::SynthBlockInitExpr(BlockExpr *Exp, NewRep); BlockDeclRefs.clear(); BlockByRefDecls.clear(); - BlockByRefDeclsPtrSet.clear(); BlockByCopyDecls.clear(); - BlockByCopyDeclsPtrSet.clear(); ImportedBlockDecls.clear(); return NewRep; } diff --git a/clang/lib/StaticAnalyzer/Core/ProgramState.cpp b/clang/lib/StaticAnalyzer/Core/ProgramState.cpp index e6d3399a219424..0be2709f0907d8 100644 --- a/clang/lib/StaticAnalyzer/Core/ProgramState.cpp +++ b/clang/lib/StaticAnalyzer/Core/ProgramState.cpp @@ -147,30 +147,24 @@ ProgramState::bindDefaultZero(SVal loc, const LocationContext *LCtx) const { typedef ArrayRef RegionList; typedef ArrayRef ValueList; -ProgramStateRef -ProgramState::invalidateRegions(RegionList Regions, - const Expr *E, unsigned Count, - const LocationContext *LCtx, - bool CausedByPointerEscape, - InvalidatedSymbols *IS, - const CallEvent *Call, - RegionAndSymbolInvalidationTraits *ITraits) const { +ProgramStateRef ProgramState::invalidateRegions( + RegionList Regions, const Stmt *S, unsigned Count, + const LocationContext *LCtx, bool CausedByPointerEscape, + InvalidatedSymbols *IS, const CallEvent *Call, + RegionAndSymbolInvalidationTraits *ITraits) const { SmallVector Values; for (const MemRegion *Reg : Regions) Values.push_back(loc::MemRegionVal(Reg)); - return invalidateRegions(Values, E, Count, LCtx, CausedByPointerEscape, IS, + return invalidateRegions(Values, S, Count, LCtx, CausedByPointerEscape, IS, Call, ITraits); } -ProgramStateRef -ProgramState::invalidateRegions(ValueList Values, - const Expr *E, unsigned Count, - const LocationContext *LCtx, - bool CausedByPointerEscape, - InvalidatedSymbols *IS, - const CallEvent *Call, - RegionAndSymbolInvalidationTraits *ITraits) const { +ProgramStateRef ProgramState::invalidateRegions( + ValueList Values, const Stmt *S, unsigned Count, + const LocationContext *LCtx, bool CausedByPointerEscape, + InvalidatedSymbols *IS, const CallEvent *Call, + RegionAndSymbolInvalidationTraits *ITraits) const { ProgramStateManager &Mgr = getStateManager(); ExprEngine &Eng = Mgr.getOwningEngine(); @@ -186,7 +180,7 @@ ProgramState::invalidateRegions(ValueList Values, StoreManager::InvalidatedRegions TopLevelInvalidated; StoreManager::InvalidatedRegions Invalidated; const StoreRef &NewStore = Mgr.StoreMgr->invalidateRegions( - getStore(), Values, E, Count, LCtx, Call, *IS, *ITraits, + getStore(), Values, S, Count, LCtx, Call, *IS, *ITraits, &TopLevelInvalidated, &Invalidated); ProgramStateRef NewState = 
makeWithStore(NewStore); diff --git a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp index c257a87dff385b..674099dd7e1f0f 100644 --- a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp +++ b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp @@ -405,19 +405,15 @@ class RegionStoreManager : public StoreManager { //===-------------------------------------------------------------------===// // Binding values to regions. //===-------------------------------------------------------------------===// - RegionBindingsRef invalidateGlobalRegion(MemRegion::Kind K, - const Expr *Ex, + RegionBindingsRef invalidateGlobalRegion(MemRegion::Kind K, const Stmt *S, unsigned Count, const LocationContext *LCtx, RegionBindingsRef B, InvalidatedRegions *Invalidated); - StoreRef invalidateRegions(Store store, - ArrayRef Values, - const Expr *E, unsigned Count, - const LocationContext *LCtx, - const CallEvent *Call, - InvalidatedSymbols &IS, + StoreRef invalidateRegions(Store store, ArrayRef Values, const Stmt *S, + unsigned Count, const LocationContext *LCtx, + const CallEvent *Call, InvalidatedSymbols &IS, RegionAndSymbolInvalidationTraits &ITraits, InvalidatedRegions *Invalidated, InvalidatedRegions *InvalidatedTopLevel) override; @@ -975,7 +971,7 @@ RegionStoreManager::removeSubRegionBindings(RegionBindingsConstRef B, namespace { class InvalidateRegionsWorker : public ClusterAnalysis { - const Expr *Ex; + const Stmt *S; unsigned Count; const LocationContext *LCtx; InvalidatedSymbols &IS; @@ -983,18 +979,15 @@ class InvalidateRegionsWorker : public ClusterAnalysis StoreManager::InvalidatedRegions *Regions; GlobalsFilterKind GlobalsFilter; public: - InvalidateRegionsWorker(RegionStoreManager &rm, - ProgramStateManager &stateMgr, - RegionBindingsRef b, - const Expr *ex, unsigned count, - const LocationContext *lctx, - InvalidatedSymbols &is, + InvalidateRegionsWorker(RegionStoreManager &rm, ProgramStateManager &stateMgr, + RegionBindingsRef b, const Stmt *S, unsigned count, + const LocationContext *lctx, InvalidatedSymbols &is, RegionAndSymbolInvalidationTraits &ITraitsIn, StoreManager::InvalidatedRegions *r, GlobalsFilterKind GFK) - : ClusterAnalysis(rm, stateMgr, b), - Ex(ex), Count(count), LCtx(lctx), IS(is), ITraits(ITraitsIn), Regions(r), - GlobalsFilter(GFK) {} + : ClusterAnalysis(rm, stateMgr, b), S(S), + Count(count), LCtx(lctx), IS(is), ITraits(ITraitsIn), Regions(r), + GlobalsFilter(GFK) {} void VisitCluster(const MemRegion *baseR, const ClusterBindings *C); void VisitBinding(SVal V); @@ -1127,7 +1120,7 @@ void InvalidateRegionsWorker::VisitCluster(const MemRegion *baseR, // Invalidate the region by setting its default value to // conjured symbol. The type of the symbol is irrelevant. DefinedOrUnknownSVal V = - svalBuilder.conjureSymbolVal(baseR, Ex, LCtx, Ctx.IntTy, Count); + svalBuilder.conjureSymbolVal(baseR, S, LCtx, Ctx.IntTy, Count); B = B.addBinding(baseR, BindingKey::Default, V); return; } @@ -1148,8 +1141,8 @@ void InvalidateRegionsWorker::VisitCluster(const MemRegion *baseR, if (T->isRecordType()) { // Invalidate the region by setting its default value to // conjured symbol. The type of the symbol is irrelevant. 
- DefinedOrUnknownSVal V = svalBuilder.conjureSymbolVal(baseR, Ex, LCtx, - Ctx.IntTy, Count); + DefinedOrUnknownSVal V = + svalBuilder.conjureSymbolVal(baseR, S, LCtx, Ctx.IntTy, Count); B = B.addBinding(baseR, BindingKey::Default, V); return; } @@ -1216,15 +1209,14 @@ void InvalidateRegionsWorker::VisitCluster(const MemRegion *baseR, } conjure_default: // Set the default value of the array to conjured symbol. - DefinedOrUnknownSVal V = - svalBuilder.conjureSymbolVal(baseR, Ex, LCtx, - AT->getElementType(), Count); - B = B.addBinding(baseR, BindingKey::Default, V); - return; + DefinedOrUnknownSVal V = svalBuilder.conjureSymbolVal( + baseR, S, LCtx, AT->getElementType(), Count); + B = B.addBinding(baseR, BindingKey::Default, V); + return; } - DefinedOrUnknownSVal V = svalBuilder.conjureSymbolVal(baseR, Ex, LCtx, - T,Count); + DefinedOrUnknownSVal V = + svalBuilder.conjureSymbolVal(baseR, S, LCtx, T, Count); assert(SymbolManager::canSymbolicate(T) || V.isUnknown()); B = B.addBinding(baseR, BindingKey::Direct, V); } @@ -1252,19 +1244,16 @@ bool InvalidateRegionsWorker::includeEntireMemorySpace(const MemRegion *Base) { RegionAndSymbolInvalidationTraits::TK_EntireMemSpace); } -RegionBindingsRef -RegionStoreManager::invalidateGlobalRegion(MemRegion::Kind K, - const Expr *Ex, - unsigned Count, - const LocationContext *LCtx, - RegionBindingsRef B, - InvalidatedRegions *Invalidated) { +RegionBindingsRef RegionStoreManager::invalidateGlobalRegion( + MemRegion::Kind K, const Stmt *S, unsigned Count, + const LocationContext *LCtx, RegionBindingsRef B, + InvalidatedRegions *Invalidated) { // Bind the globals memory space to a new symbol that we will use to derive // the bindings for all globals. const GlobalsSpaceRegion *GS = MRMgr.getGlobalsRegion(K); - SVal V = svalBuilder.conjureSymbolVal(/* symbolTag = */ (const void*) GS, Ex, LCtx, - /* type does not matter */ Ctx.IntTy, - Count); + SVal V = + svalBuilder.conjureSymbolVal(/* symbolTag = */ (const void *)GS, S, LCtx, + /* type does not matter */ Ctx.IntTy, Count); B = B.removeBinding(GS) .addBinding(BindingKey::Make(GS, BindingKey::Default), V); @@ -1298,16 +1287,11 @@ void RegionStoreManager::populateWorkList(InvalidateRegionsWorker &W, } } -StoreRef -RegionStoreManager::invalidateRegions(Store store, - ArrayRef Values, - const Expr *Ex, unsigned Count, - const LocationContext *LCtx, - const CallEvent *Call, - InvalidatedSymbols &IS, - RegionAndSymbolInvalidationTraits &ITraits, - InvalidatedRegions *TopLevelRegions, - InvalidatedRegions *Invalidated) { +StoreRef RegionStoreManager::invalidateRegions( + Store store, ArrayRef Values, const Stmt *S, unsigned Count, + const LocationContext *LCtx, const CallEvent *Call, InvalidatedSymbols &IS, + RegionAndSymbolInvalidationTraits &ITraits, + InvalidatedRegions *TopLevelRegions, InvalidatedRegions *Invalidated) { GlobalsFilterKind GlobalsFilter; if (Call) { if (Call->isInSystemHeader()) @@ -1319,7 +1303,7 @@ RegionStoreManager::invalidateRegions(Store store, } RegionBindingsRef B = getRegionBindings(store); - InvalidateRegionsWorker W(*this, StateMgr, B, Ex, Count, LCtx, IS, ITraits, + InvalidateRegionsWorker W(*this, StateMgr, B, S, Count, LCtx, IS, ITraits, Invalidated, GlobalsFilter); // Scan the bindings and generate the clusters. @@ -1339,12 +1323,12 @@ RegionStoreManager::invalidateRegions(Store store, // TODO: This could possibly be more precise with modules. 
switch (GlobalsFilter) { case GFK_All: - B = invalidateGlobalRegion(MemRegion::GlobalInternalSpaceRegionKind, - Ex, Count, LCtx, B, Invalidated); + B = invalidateGlobalRegion(MemRegion::GlobalInternalSpaceRegionKind, S, + Count, LCtx, B, Invalidated); [[fallthrough]]; case GFK_SystemOnly: - B = invalidateGlobalRegion(MemRegion::GlobalSystemSpaceRegionKind, - Ex, Count, LCtx, B, Invalidated); + B = invalidateGlobalRegion(MemRegion::GlobalSystemSpaceRegionKind, S, Count, + LCtx, B, Invalidated); [[fallthrough]]; case GFK_None: break; diff --git a/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp b/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp index 7eca0579143f44..cb5fcbade2cfc2 100644 --- a/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp +++ b/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp @@ -174,7 +174,7 @@ DefinedOrUnknownSVal SValBuilder::conjureSymbolVal(const void *SymbolTag, } DefinedOrUnknownSVal SValBuilder::conjureSymbolVal(const void *symbolTag, - const Expr *expr, + const Stmt *St, const LocationContext *LCtx, QualType type, unsigned count) { @@ -184,7 +184,7 @@ DefinedOrUnknownSVal SValBuilder::conjureSymbolVal(const void *symbolTag, if (!SymbolManager::canSymbolicate(type)) return UnknownVal(); - SymbolRef sym = SymMgr.conjureSymbol(expr, LCtx, type, count, symbolTag); + SymbolRef sym = SymMgr.conjureSymbol(St, LCtx, type, count, symbolTag); if (Loc::isLocType(type)) return loc::MemRegionVal(MemMgr.getSymbolicRegion(sym)); diff --git a/compiler-rt/lib/rtsan/rtsan.cpp b/compiler-rt/lib/rtsan/rtsan.cpp index 2afdf3c76696e7..e6d2481b2c2a3d 100644 --- a/compiler-rt/lib/rtsan/rtsan.cpp +++ b/compiler-rt/lib/rtsan/rtsan.cpp @@ -22,11 +22,25 @@ using namespace __rtsan; using namespace __sanitizer; +namespace { +enum class InitializationState : u8 { + Uninitialized, + Initializing, + Initialized, +}; +} // namespace + static StaticSpinMutex rtsan_inited_mutex; static atomic_uint8_t rtsan_initialized = {0}; -static void SetInitialized() { - atomic_store(&rtsan_initialized, 1, memory_order_release); +static void SetInitializationState(InitializationState state) { + atomic_store(&rtsan_initialized, static_cast(state), + memory_order_release); +} + +static InitializationState GetInitializationState() { + return static_cast( + atomic_load(&rtsan_initialized, memory_order_acquire)); } static auto PrintDiagnosticsAndDieAction(DiagnosticsInfo info) { @@ -39,13 +53,14 @@ static auto PrintDiagnosticsAndDieAction(DiagnosticsInfo info) { extern "C" { SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_init() { - CHECK(!__rtsan_is_initialized()); + CHECK(GetInitializationState() == InitializationState::Uninitialized); + SetInitializationState(InitializationState::Initializing); SanitizerToolName = "RealtimeSanitizer"; InitializeFlags(); InitializeInterceptors(); - SetInitialized(); + SetInitializationState(InitializationState::Initialized); } SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_ensure_initialized() { @@ -62,7 +77,7 @@ SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_ensure_initialized() { } SANITIZER_INTERFACE_ATTRIBUTE bool __rtsan_is_initialized() { - return atomic_load(&rtsan_initialized, memory_order_acquire) == 1; + return GetInitializationState() == InitializationState::Initialized; } SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_realtime_enter() { @@ -83,11 +98,16 @@ SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_enable() { SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_notify_intercepted_call(const char *func_name) { + // While initializing, we need all intercepted functions to behave normally + if 
(GetInitializationState() == InitializationState::Initializing) + return; + __rtsan_ensure_initialized(); GET_CALLER_PC_BP; ExpectNotRealtime( GetContextForThisThread(), - PrintDiagnosticsAndDieAction({InterceptedCallInfo{func_name}, pc, bp})); + PrintDiagnosticsAndDieAction( + {DiagnosticsInfoType::InterceptedCall, func_name, pc, bp})); } SANITIZER_INTERFACE_ATTRIBUTE void @@ -96,7 +116,8 @@ __rtsan_notify_blocking_call(const char *func_name) { GET_CALLER_PC_BP; ExpectNotRealtime( GetContextForThisThread(), - PrintDiagnosticsAndDieAction({BlockingCallInfo{func_name}, pc, bp})); + PrintDiagnosticsAndDieAction( + {DiagnosticsInfoType::BlockingCall, func_name, pc, bp})); } } // extern "C" diff --git a/compiler-rt/lib/rtsan/rtsan_diagnostics.cpp b/compiler-rt/lib/rtsan/rtsan_diagnostics.cpp index ac13b0743be069..f82001f5b2057c 100644 --- a/compiler-rt/lib/rtsan/rtsan_diagnostics.cpp +++ b/compiler-rt/lib/rtsan/rtsan_diagnostics.cpp @@ -37,12 +37,6 @@ class Decorator : public __sanitizer::SanitizerCommonDecorator { const char *FunctionName() const { return Green(); } const char *Reason() const { return Blue(); } }; - -template struct Overloaded : Ts... { - using Ts::operator()...; -}; -// TODO: Remove below when c++20 -template Overloaded(Ts...) -> Overloaded; } // namespace static void PrintStackTrace(uptr pc, uptr bp) { @@ -53,35 +47,39 @@ static void PrintStackTrace(uptr pc, uptr bp) { } static void PrintError(const Decorator &decorator, - const DiagnosticsCallerInfo &info) { - const char *violation_type = std::visit( - Overloaded{ - [](const InterceptedCallInfo &) { return "unsafe-library-call"; }, - [](const BlockingCallInfo &) { return "blocking-call"; }}, - info); + const DiagnosticsInfo &info) { + const auto ErrorTypeStr = [&info]() -> const char * { + switch (info.type) { + case DiagnosticsInfoType::InterceptedCall: + return "unsafe-library-call"; + case DiagnosticsInfoType::BlockingCall: + return "blocking-call"; + } + return "(unknown error)"; + }; Printf("%s", decorator.Error()); - Report("ERROR: RealtimeSanitizer: %s\n", violation_type); + Report("ERROR: RealtimeSanitizer: %s\n", ErrorTypeStr()); } static void PrintReason(const Decorator &decorator, - const DiagnosticsCallerInfo &info) { + const DiagnosticsInfo &info) { Printf("%s", decorator.Reason()); - std::visit( - Overloaded{[decorator](const InterceptedCallInfo &call) { - Printf("Intercepted call to real-time unsafe function " - "`%s%s%s` in real-time context!", - decorator.FunctionName(), - call.intercepted_function_name_, decorator.Reason()); - }, - [decorator](const BlockingCallInfo &arg) { - Printf("Call to blocking function " - "`%s%s%s` in real-time context!", - decorator.FunctionName(), arg.blocking_function_name_, - decorator.Reason()); - }}, - info); + switch (info.type) { + case DiagnosticsInfoType::InterceptedCall: { + Printf("Intercepted call to real-time unsafe function " + "`%s%s%s` in real-time context!", + decorator.FunctionName(), info.func_name, decorator.Reason()); + break; + } + case DiagnosticsInfoType::BlockingCall: { + Printf("Call to blocking function " + "`%s%s%s` in real-time context!", + decorator.FunctionName(), info.func_name, decorator.Reason()); + break; + } + } Printf("\n"); } @@ -90,8 +88,8 @@ void __rtsan::PrintDiagnostics(const DiagnosticsInfo &info) { ScopedErrorReportLock l; Decorator d; - PrintError(d, info.call_info); - PrintReason(d, info.call_info); + PrintError(d, info); + PrintReason(d, info); Printf("%s", d.Default()); PrintStackTrace(info.pc, info.bp); } diff --git 
a/compiler-rt/lib/rtsan/rtsan_diagnostics.h b/compiler-rt/lib/rtsan/rtsan_diagnostics.h index 8aec512584b309..f8a6b8a954a24a 100644 --- a/compiler-rt/lib/rtsan/rtsan_diagnostics.h +++ b/compiler-rt/lib/rtsan/rtsan_diagnostics.h @@ -15,25 +15,16 @@ #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_internal_defs.h" -#include - namespace __rtsan { -struct InterceptedCallInfo { - const char *intercepted_function_name_; -}; - -struct BlockingCallInfo { -public: - const char *blocking_function_name_; +enum class DiagnosticsInfoType { + InterceptedCall, + BlockingCall, }; -using DiagnosticsCallerInfo = - std::variant; - struct DiagnosticsInfo { - DiagnosticsCallerInfo call_info; - + DiagnosticsInfoType type; + const char *func_name; __sanitizer::uptr pc; __sanitizer::uptr bp; }; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc index e29cc0c8b390ab..e3a329712ac5a3 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc @@ -1291,7 +1291,9 @@ INTERCEPTOR(int, prctl, int option, unsigned long arg2, unsigned long arg3, static const int PR_GET_PDEATHSIG = 2; static const int PR_SET_SECCOMP = 22; +# if !SANITIZER_ANDROID static const int SECCOMP_MODE_FILTER = 2; +# endif if (option == PR_SET_VMA && arg2 == 0UL) { char *name = (char *)arg5; COMMON_INTERCEPTOR_READ_RANGE(ctx, name, internal_strlen(name) + 1); @@ -1310,9 +1312,11 @@ INTERCEPTOR(int, prctl, int option, unsigned long arg2, unsigned long arg3, COMMON_INTERCEPTOR_WRITE_RANGE(ctx, (u64 *)(arg5), sizeof(u64)); } else if (res != -1 && option == PR_GET_PDEATHSIG) { COMMON_INTERCEPTOR_WRITE_RANGE(ctx, (u64 *)(arg2), sizeof(int)); +# if !SANITIZER_ANDROID } else if (res != -1 && option == PR_SET_SECCOMP && arg2 == SECCOMP_MODE_FILTER) { COMMON_INTERCEPTOR_WRITE_RANGE(ctx, (u64 *)(arg3), struct_sock_fprog_sz); +# endif } return res; } diff --git a/compiler-rt/test/asan/lit.cfg.py b/compiler-rt/test/asan/lit.cfg.py index bc592c8be00337..ac29477cca2b81 100644 --- a/compiler-rt/test/asan/lit.cfg.py +++ b/compiler-rt/test/asan/lit.cfg.py @@ -153,12 +153,16 @@ def build_invocation(compile_flags, with_lto=False): if platform.system() == "Windows": # MSVC-specific tests might also use the clang-cl.exe driver. if target_is_msvc: - clang_cl_cxxflags = [ - "-Wno-deprecated-declarations", - "-WX", - "-D_HAS_EXCEPTIONS=0", - "-Zi", - ] + target_cflags + clang_cl_cxxflags = ( + [ + "-WX", + "-D_HAS_EXCEPTIONS=0", + ] + + config.debug_info_flags + + target_cflags + ) + if config.compiler_id != "MSVC": + clang_cl_cxxflags = ["-Wno-deprecated-declarations"] + clang_cl_cxxflags clang_cl_asan_cxxflags = ["-fsanitize=address"] + clang_cl_cxxflags if config.asan_dynamic: clang_cl_asan_cxxflags.append("-MD") @@ -286,6 +290,12 @@ def build_invocation(compile_flags, with_lto=False): [config.compiler_rt_libdir, os.environ.get("PATH", "")] ) +# msvc needs to be instructed where the compiler-rt libraries are +if config.compiler_id == "MSVC": + config.environment["LIB"] = os.path.pathsep.join( + [config.compiler_rt_libdir, config.environment.get("LIB", "")] + ) + # Default test suffixes. 
config.suffixes = [".c", ".cpp"] diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index 9fb89e6fd8d28a..b4cfe47f4505fd 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -240,6 +240,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdio.putchar libc.src.stdio.puts libc.src.stdio.remove + libc.src.stdio.rename libc.src.stdio.stderr libc.src.stdio.stdin libc.src.stdio.stdout diff --git a/libc/docs/gpu/support.rst b/libc/docs/gpu/support.rst index 44c21c7b4c1ff9..9c151a5fbac1f6 100644 --- a/libc/docs/gpu/support.rst +++ b/libc/docs/gpu/support.rst @@ -240,6 +240,7 @@ fputs |check| |check| fputc |check| |check| fwrite |check| |check| remove |check| |check| +rename |check| |check| putc |check| |check| printf |check| |check| vprintf |check| |check| diff --git a/libc/include/llvm-libc-types/rpc_opcodes_t.h b/libc/include/llvm-libc-types/rpc_opcodes_t.h index 3b388de6888c5d..1a6c0cd9bc4a14 100644 --- a/libc/include/llvm-libc-types/rpc_opcodes_t.h +++ b/libc/include/llvm-libc-types/rpc_opcodes_t.h @@ -38,6 +38,7 @@ typedef enum { RPC_PRINTF_TO_STDERR_PACKED, RPC_PRINTF_TO_STREAM_PACKED, RPC_REMOVE, + RPC_RENAME, RPC_SYSTEM, RPC_LAST = 0xFFFF, } rpc_opcode_t; diff --git a/libc/src/stdio/gpu/CMakeLists.txt b/libc/src/stdio/gpu/CMakeLists.txt index 86470b8425e956..9cac42ed71fb76 100644 --- a/libc/src/stdio/gpu/CMakeLists.txt +++ b/libc/src/stdio/gpu/CMakeLists.txt @@ -294,6 +294,17 @@ add_entrypoint_object( .vfprintf_utils ) +add_entrypoint_object( + rename + SRCS + rename.cpp + HDRS + ../rename.h + DEPENDS + libc.hdr.types.FILE + .gpu_file +) + add_entrypoint_object( stdin SRCS diff --git a/libc/src/stdio/gpu/rename.cpp b/libc/src/stdio/gpu/rename.cpp new file mode 100644 index 00000000000000..1087228835842e --- /dev/null +++ b/libc/src/stdio/gpu/rename.cpp @@ -0,0 +1,30 @@ +//===-- GPU Implementation of rename --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdio/rename.h" +#include "src/__support/CPP/string_view.h" +#include "src/__support/macros/config.h" +#include "src/stdio/gpu/file.h" + +#include "hdr/types/FILE.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, rename, (const char *oldpath, const char *newpath)) { + int ret; + rpc::Client::Port port = rpc::client.open(); + port.send_n(oldpath, internal::string_length(oldpath) + 1); + port.send_n(newpath, internal::string_length(newpath) + 1); + port.recv( + [&](rpc::Buffer *buffer) { ret = static_cast(buffer->data[0]); }); + port.close(); + + return ret; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp index 8708f946b310ee..aa65dfe69c385c 100644 --- a/libc/utils/gpu/server/rpc_server.cpp +++ b/libc/utils/gpu/server/rpc_server.cpp @@ -392,6 +392,24 @@ rpc_status_t handle_server_impl( }); break; } + case RPC_RENAME: { + uint64_t oldsizes[lane_size] = {0}; + uint64_t newsizes[lane_size] = {0}; + void *oldpath[lane_size] = {nullptr}; + void *newpath[lane_size] = {nullptr}; + port->recv_n(oldpath, oldsizes, + [&](uint64_t size) { return new char[size]; }); + port->recv_n(newpath, newsizes, + [&](uint64_t size) { return new char[size]; }); + port->send([&](rpc::Buffer *buffer, uint32_t id) { + buffer->data[0] = static_cast( + rename(reinterpret_cast(oldpath[id]), + reinterpret_cast(newpath[id]))); + delete[] reinterpret_cast(oldpath[id]); + delete[] reinterpret_cast(newpath[id]); + }); + break; + } case RPC_SYSTEM: { uint64_t sizes[lane_size] = {0}; void *args[lane_size] = {nullptr}; diff --git a/lld/test/wasm/unsupported-pic-relocations.s b/lld/test/wasm/unsupported-pic-relocations.s index ea32e8468cdb4d..2f85afa02c88b1 100644 --- a/lld/test/wasm/unsupported-pic-relocations.s +++ b/lld/test/wasm/unsupported-pic-relocations.s @@ -15,6 +15,10 @@ # RUN: not wasm-ld --experimental-pic -shared %t.o -o /dev/null --unresolved-symbols=import-dynamic 2>&1 | \ # RUN: FileCheck %s +## These errors should not be reported under -r/--relocation (i.e. when +## generating an object file) +# RUN: wasm-ld --experimental-pic -r %t.o -o /dev/null + .functype external_func () -> () use_undefined_function: @@ -23,7 +27,7 @@ use_undefined_function: # CHECK: error: {{.*}}.o: relocation R_WASM_TABLE_INDEX_REL_SLEB is not supported against an undefined symbol `external_func` drop end_function - + use_undefined_data: .functype use_undefined_data () -> () i32.const external_data@MBREL diff --git a/lld/test/wasm/unsupported-pic-relocations64.s b/lld/test/wasm/unsupported-pic-relocations64.s index db9707b7fbac5e..df885b8d75fbe8 100644 --- a/lld/test/wasm/unsupported-pic-relocations64.s +++ b/lld/test/wasm/unsupported-pic-relocations64.s @@ -15,6 +15,10 @@ # RUN: not wasm-ld -mwasm64 --experimental-pic -shared %t.o -o /dev/null --unresolved-symbols=import-dynamic 2>&1 | \ # RUN: FileCheck %s +## These errors should not be reported under -r/--relocation (i.e. 
when +## generating an object file) +# RUN: wasm-ld -mwasm64 --experimental-pic -r %t.o -o /dev/null + .functype external_func () -> () use_undefined_function: @@ -23,7 +27,7 @@ use_undefined_function: # CHECK: error: {{.*}}.o: relocation R_WASM_TABLE_INDEX_REL_SLEB64 is not supported against an undefined symbol `external_func` drop end_function - + use_undefined_data: .functype use_undefined_data () -> () i64.const external_data@MBREL diff --git a/lld/wasm/Relocations.cpp b/lld/wasm/Relocations.cpp index 2dbfe335494711..45ad32701616a1 100644 --- a/lld/wasm/Relocations.cpp +++ b/lld/wasm/Relocations.cpp @@ -173,7 +173,7 @@ void scanRelocations(InputChunk *chunk) { } } - if (sym->isUndefined()) { + if (!config->relocatable && sym->isUndefined()) { switch (reloc.Type) { case R_WASM_TABLE_INDEX_REL_SLEB: case R_WASM_TABLE_INDEX_REL_SLEB64: @@ -187,11 +187,11 @@ void scanRelocations(InputChunk *chunk) { toString(*sym) + "`"); break; } - } - if (sym->isUndefined() && !config->relocatable && !sym->isWeak()) { - // Report undefined symbols - reportUndefined(file, sym); + if (!sym->isWeak()) { + // Report undefined symbols + reportUndefined(file, sym); + } } } } diff --git a/lldb/bindings/python/python-wrapper.swig b/lldb/bindings/python/python-wrapper.swig index 961fb2d1a76178..b72a462d04643b 100644 --- a/lldb/bindings/python/python-wrapper.swig +++ b/lldb/bindings/python/python-wrapper.swig @@ -667,6 +667,79 @@ lldb_private::python::SWIGBridge::LLDBSwigPythonGetRepeatCommandForScriptedComma return result.Str().GetString().str(); } +StructuredData::DictionarySP +lldb_private::python::SWIGBridge::LLDBSwigPythonHandleArgumentCompletionForScriptedCommand(PyObject *implementor, + std::vector &args_vec, size_t args_pos, size_t pos_in_arg) { + + PyErr_Cleaner py_err_cleaner(true); + + PythonObject self(PyRefType::Borrowed, implementor); + auto pfunc = self.ResolveName("handle_argument_completion"); + // If this isn't implemented, return an empty dict to signal falling back to default completion: + if (!pfunc.IsAllocated()) + return {}; + + PythonList args_list(PyInitialValue::Empty); + for (auto elem : args_vec) + args_list.AppendItem(PythonString(elem)); + + PythonObject result = pfunc(args_list, PythonInteger(args_pos), PythonInteger(pos_in_arg)); + // Returning None means do the ordinary completion + if (result.IsNone()) + return {}; + + // Convert the return dictionary to a DictionarySP. 
+ StructuredData::ObjectSP result_obj_sp = result.CreateStructuredObject(); + if (!result_obj_sp) + return {}; + + StructuredData::DictionarySP dict_sp(new StructuredData::Dictionary(result_obj_sp)); + if (dict_sp->GetType() == lldb::eStructuredDataTypeInvalid) + return {}; + return dict_sp; +} + +StructuredData::DictionarySP +lldb_private::python::SWIGBridge::LLDBSwigPythonHandleOptionArgumentCompletionForScriptedCommand(PyObject *implementor, + llvm::StringRef &long_option, size_t pos_in_arg) { + + PyErr_Cleaner py_err_cleaner(true); + + PythonObject self(PyRefType::Borrowed, implementor); + auto pfunc = self.ResolveName("handle_option_argument_completion"); + // If this isn't implemented, return an empty dict to signal falling back to default completion: + if (!pfunc.IsAllocated()) + return {}; + + PythonObject result = pfunc(PythonString(long_option), PythonInteger(pos_in_arg)); + // Returning None means do the ordinary completion + if (result.IsNone()) + return {}; + + // Returning a boolean: + // True means the completion was handled, but there were no completions + // False means that the completion was not handled, again, do the ordinary completion: + if (result.GetObjectType() == PyObjectType::Boolean) { + if (!result.IsTrue()) + return {}; + // Make up a completion dictionary with the right element: + StructuredData::DictionarySP dict_sp(new StructuredData::Dictionary()); + dict_sp->AddBooleanItem("no-completion", true); + return dict_sp; + } + + + // Convert the return dictionary to a DictionarySP. + StructuredData::ObjectSP result_obj_sp = result.CreateStructuredObject(); + if (!result_obj_sp) + return {}; + + StructuredData::DictionarySP dict_sp(new StructuredData::Dictionary(result_obj_sp)); + if (dict_sp->GetType() == lldb::eStructuredDataTypeInvalid) + return {}; + return dict_sp; +} + #include "lldb/Interpreter/CommandReturnObject.h" bool lldb_private::python::SWIGBridge::LLDBSwigPythonCallParsedCommandObject( diff --git a/lldb/docs/use/python-reference.rst b/lldb/docs/use/python-reference.rst index b12048f1af067d..95a6020ca3e455 100644 --- a/lldb/docs/use/python-reference.rst +++ b/lldb/docs/use/python-reference.rst @@ -551,7 +551,7 @@ command definition form can't do the right thing. Since lldb 3.7, Python commands can also be implemented by means of a class which should implement the following interface: -:: +.. code-block:: python class CommandObjectType: def __init__(self, debugger, internal_dict): @@ -586,20 +586,193 @@ which should implement the following interface: As a convenience, you can treat the result object as a Python file object, and say -:: +.. code-block:: python print >>result, "my command does lots of cool stuff" SBCommandReturnObject and SBStream both support this file-like behavior by providing write() and flush() calls at the Python layer. +The commands that are added using this class definition are what lldb calls +"raw" commands. The command interpreter doesn't attempt to parse the command, +doesn't handle option values, neither generating help for them, or their +completion. Raw commands are useful when the arguments passed to the command +are unstructured, and having to protect them against lldb command parsing would +be onerous. For instance, "expr" is a raw command. + +You can also add scripted commands that implement the "parsed command", where +the options and their types are specified, as well as the argument and argument +types. 
These commands look and act like the majority of lldb commands, and you +can also add custom completions for the options and/or the arguments if you have +special needs. + +The easiest way to do this is to derive your new command from the lldb.ParsedCommand +class. That responds in the same way to the help & repeat command interfaces, and +provides some convenience methods, and most importantly an LLDBOptionValueParser, +accessed throught lldb.ParsedCommand.get_parser(). The parser is used to set +your command definitions, and to retrieve option values in the __call__ method. + +To set up the command definition, implement the ParsedCommand abstract method: + +.. code-block:: python + + def setup_command_definition(self): + +This is called when your command is added to lldb. In this method you add the +options and their types, the option help strings, etc. to the command using the API: + +.. code-block:: python + + def add_option(self, short_option, long_option, help, default, + dest = None, required=False, groups = None, + value_type=lldb.eArgTypeNone, completion_type=None, + enum_values=None): + """ + short_option: one character, must be unique, not required + long_option: no spaces, must be unique, required + help: a usage string for this option, will print in the command help + default: the initial value for this option (if it has a value) + dest: the name of the property that gives you access to the value for + this value. Defaults to the long option if not provided. + required: if true, this option must be provided or the command will error out + groups: Which "option groups" does this option belong to. This can either be + a simple list (e.g. [1, 3, 4, 5]) or you can specify ranges by sublists: + so [1, [3,5]] is the same as [1, 3, 4, 5]. + value_type: one of the lldb.eArgType enum values. Some of the common arg + types also have default completers, which will be applied automatically. + completion_type: currently these are values form the lldb.CompletionType enum. If + you need custom completions, implement handle_option_argument_completion. + enum_values: An array of duples: ["element_name", "element_help"]. If provided, + only one of the enum elements is allowed. The value will be the + element_name for the chosen enum element as a string. + """ + +Similarly, you can add argument types to the command: + +.. code-block:: python + + def make_argument_element(self, arg_type, repeat = "optional", groups = None): + """ + arg_type: The argument type, one of the lldb.eArgType enum values. + repeat: Choose from the following options: + "plain" - one value + "optional" - zero or more values + "plus" - one or more values + groups: As with add_option. + """ + +Then implement the body of the command by defining: + +.. code-block:: python + + def __call__(self, debugger, args_array, exe_ctx, result): + """This is the command callback. The option values are + provided by the 'dest' properties on the parser. + + args_array: This is the list of arguments provided. + exe_ctx: Gives the SBExecutionContext on which the + command should operate. + result: Any results of the command should be + written into this SBCommandReturnObject. + """ + +This differs from the "raw" command's __call__ in that the arguments are already +parsed into the args_array, and the option values are set in the parser, and +can be accessed using their property name. The LLDBOptionValueParser class has +a couple of other handy methods: + +.. 
code-block:: python + def was_set(self, long_option_name): + +returns True if the option was specified on the command line. + +.. code-block:: python + + def dest_for_option(self, long_option_name): + """ + This will return the value of the dest variable you defined for opt_name. + Mostly useful for handle_completion where you get passed the long option. + """ + +lldb will handle completing your option names, and all your enum values +automatically. If your option or argument types have associated built-in completers, +then lldb will also handle that completion for you. But if you have a need for +custom completions, either in your arguments or option values, you can handle +completion by hand as well. To handle completion of option value arguments, +your lldb.ParsedCommand subclass should implement: + +.. code-block:: python + + def handle_option_argument_completion(self, long_option, cursor_pos): + """ + long_option: The long option name of the option whose value you are + asked to complete. + cursor_pos: The cursor position in the value for that option - which + you can get from the option parser. + """ + +And to handle the completion of arguments: + +.. code-block:: python + + def handle_argument_completion(self, args, arg_pos, cursor_pos): + """ + args: A list of the arguments to the command + arg_pos: An index into the args list of the argument with the cursor + cursor_pos: The cursor position in the arg specified by arg_pos + """ + +When either of these API's is called, the command line will have been parsed up to +the word containing the cursor, and any option values set in that part of the command +string are available from the option value parser. That's useful for instance +if you have a --shared-library option that would constrain the completions for, +say, a symbol name option or argument. + +The return value specifies what the completion options are. You have four +choices: + +- `True`: the completion was handled with no completions. + +- `False`: the completion was not handled, forward it to the regular +completion machinery. + +- A dictionary with the key: "completion": there is one candidate, +whose value is the value of the "completion" key. Optionally you can pass a +"mode" key whose value is either "partial" or "complete". Return partial if +the "completion" string is a prefix for all the completed value. + +For instance, if the string you are completing is "Test" and the available completions are: +"Test1", "Test11" and "Test111", you should return the dictionary: + +.. code-block:: python + + return {"completion": "Test1", "mode" : "partial"} + +and then lldb will add the "1" at the curson and advance it after the added string, +waiting for more completions. But if "Test1" is the only completion, return: + +.. code-block:: python + + {"completion": "Test1", "mode": "complete"} + +and lldb will add "1 " at the cursor, indicating the command string is complete. + +The default is "complete", you don't need to specify a "mode" in that case. + +- A dictionary with the key: "values" whose value is a list of candidate completion +strings. The command interpreter will present those strings as the available choices. +You can optionally include a "descriptions" key, whose value is a parallel array +of description strings, and the completion will show the description next to +each completion. 
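Putting these pieces together, here is a minimal sketch of a parsed command that defines one option and hand-completes its arguments. Everything specific in it is illustrative rather than taken from lldb: the class name, the "upper-case" option and its "upper_case" dest, and the candidate completion strings are made up, and registration (for example from ``__lldb_init_module`` via "command script add") plus argument declaration with make_argument_element are omitted for brevity.

.. code-block:: python

    import lldb

    class FrobCommand(lldb.ParsedCommand):
        # Illustrative only: option names, dest names, and completion
        # candidates below are not part of any real lldb command.

        def setup_command_definition(self):
            ov_parser = self.get_parser()
            ov_parser.add_option(
                "u",
                "upper-case",
                help="print the arguments in upper case",
                value_type=lldb.eArgTypeBoolean,
                dest="upper_case",
                default=False,
            )

        def __call__(self, debugger, args_array, exe_ctx, result):
            ov_parser = self.get_parser()
            for arg in args_array:
                result.AppendMessage(arg.upper() if ov_parser.upper_case else arg)

        def handle_argument_completion(self, args, arg_pos, cursor_pos):
            # Complete the word under the cursor against a fixed candidate list.
            word = args[arg_pos][0:cursor_pos]
            candidates = ["alpha", "beta", "betamax"]
            matches = [c for c in candidates if c.startswith(word)]
            if not matches:
                return False  # not handled, fall back to ordinary completion
            if len(matches) == 1:
                return {"completion": matches[0], "mode": "complete"}
            return {"values": matches,
                    "descriptions": ["candidate " + m for m in matches]}

The completion handler returns exactly the shapes described above: ``False`` to defer to the built-in machinery, a single "completion" dictionary when one candidate remains, and a "values"/"descriptions" dictionary when several remain.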
+ + One other handy convenience when defining lldb command-line commands is the -command command script import which will import a module specified by file +command "command script import" which will import a module specified by file path, so you don't have to change your PYTHONPATH for temporary scripts. It also has another convenience that if your new script module has a function of the form: -:: +.. code-block python def __lldb_init_module(debugger, internal_dict): # Command Initialization code goes here @@ -615,7 +788,7 @@ creating scripts that can be run from the command line. However, for command line scripts, the debugger instance must be created manually. Sample code would look like: -:: +.. code-block:: python if __name__ == '__main__': # Initialize the debugger before making any API calls. @@ -638,7 +811,7 @@ look like: Now we can create a module called ls.py in the file ~/ls.py that will implement a function that can be used by LLDB's python command code: -:: +.. code-block:: python #!/usr/bin/env python diff --git a/lldb/examples/python/cmdtemplate.py b/lldb/examples/python/cmdtemplate.py index b6a21cba7113e6..a9fbe0b40e1957 100644 --- a/lldb/examples/python/cmdtemplate.py +++ b/lldb/examples/python/cmdtemplate.py @@ -29,8 +29,8 @@ def get_flags(self): return lldb.eCommandRequiresFrame | lldb.eCommandProcessMustBePaused def setup_command_definition(self): - - self.ov_parser.add_option( + ov_parser = self.get_parser() + ov_parser.add_option( "i", "in-scope", help = "in_scope_only = True", @@ -39,7 +39,7 @@ def setup_command_definition(self): default = True, ) - self.ov_parser.add_option( + ov_parser.add_option( "i", "in-scope", help = "in_scope_only = True", @@ -48,7 +48,7 @@ def setup_command_definition(self): default=True, ) - self.ov_parser.add_option( + ov_parser.add_option( "a", "arguments", help = "arguments = True", @@ -57,7 +57,7 @@ def setup_command_definition(self): default = True, ) - self.ov_parser.add_option( + ov_parser.add_option( "l", "locals", help = "locals = True", @@ -66,7 +66,7 @@ def setup_command_definition(self): default = True, ) - self.ov_parser.add_option( + ov_parser.add_option( "s", "statics", help = "statics = True", @@ -103,8 +103,9 @@ def __call__(self, debugger, command, exe_ctx, result): result.SetError("invalid frame") return + ov_parser = self.get_parser() variables_list = frame.GetVariables( - self.ov_parser.arguments, self.ov_parser.locals, self.ov_parser.statics, self.ov_parser.inscope + ov_parser.arguments, ov_parser.locals, ov_parser.statics, ov_parser.inscope ) variables_count = variables_list.GetSize() if variables_count == 0: diff --git a/lldb/examples/python/templates/parsed_cmd.py b/lldb/examples/python/templates/parsed_cmd.py index 06124adf43420a..13d6eae405c08d 100644 --- a/lldb/examples/python/templates/parsed_cmd.py +++ b/lldb/examples/python/templates/parsed_cmd.py @@ -4,7 +4,8 @@ The way to use it is to make a class for your command that inherits from ParsedCommandBase. That will make an LLDBOptionValueParser which you will use for your option definition, and to fetch option values for the current invocation -of your command. Access to the OV parser is through: +of your command. For concision, I'll call this the `OVParser`. +Access to the `OVParser` is through: ParsedCommandBase.get_parser() @@ -43,7 +44,65 @@ def __call__(self, debugger, args_list, exe_ctx, result): will return True if the user set this option, and False if it was left at its default value. 
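For instance, inside your command's __call__ method (a sketch; the "max-depth" option and its "max_depth" dest are made-up names):

    ov_parser = self.get_parser()
    depth = ov_parser.max_depth if ov_parser.was_set("max-depth") else 10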
-There are example commands in the lldb testsuite at: +Custom Completions: + +You can also implement custom completers for your custom command, either for the +arguments to your command or to the option values in your command. If you use enum +values or if your option/argument uses is one of the types we have completers for, +you should not need to do this. But if you have your own completeable types, or if +you want completion of one option to be conditioned by other options on the command +line, you can use this interface to take over the completion. + +You can choose to add a completion for the option values defined for your command, +or for the arguments, separately. For the option values, define: + +def handle_option_argument_completion(self, long_option, cursor_pos): + +The line to be completed will be parsed up to the option containint the cursor position, +and the values will be set in the OptionValue parser object. long_option will be +the option name containing the cursor, and cursor_pos will be the position of the cursor +in that option's value. You can call the `OVParser` method: `dest_for_option(long_option)` +to get the value for that option. The other options that came before the cursor in the command +line will also be set in the `OVParser` when the completion handler is called. + +For argument values, define: + +def handle_argument_completion(self, args, arg_pos, cursor_pos): + +Again, the command line will be parsed up to the cursor position, and all the options +before the cursor pose will be set in the `OVParser`. args is a python list of the +arguments, arg_pos is the index of the argument with the cursor, and cursor_pos is +the position of the cursor in the argument. + +In both cases, the return value determines the completion. + +Return False to mean "Not Handled" - in which case lldb will fall back on the +standard completion machinery. + +Return True to mean "Handled with no completions". + +If there is a single unique completion, return a Python dictionary with two elements: + +return {"completion" : "completed_value", "mode" : <"partial", "complete">} + +If the mode is "partial", then the completion is to a common base, if it is "complete" +then the argument is considered done - mostly meaning lldb will put a space after the +completion string. "complete" is the default if no "mode" is specified. + +If there are multiple completion options, then return: + +return {"values" : ["option1", "option2"]} + +Optionally, you can return a parallel array of "descriptions" which the completer will +print alongside the options: + +return {"values" : ["option1", "option2"], "descriptions" : ["the first option", "the second option"]} + +The cmdtemplate example currently uses the parsed command infrastructure: + +llvm-project/lldb/examples/python/cmdtemplate.py + +There are also a few example commands in the lldb testsuite at: llvm-project/lldb/test/API/commands/command/script/add/test_commands.py """ @@ -226,10 +285,14 @@ def set_option_value(self, exe_ctx, opt_name, opt_value): return True def was_set(self, opt_name): - """ Call this in the __call__ method of your command to determine - whether this option was set on the command line. It is sometimes - useful to know whether an option has the default value because the - user set it explicitly (was_set -> True) or not. """ + """Call this in the __call__ method of your command to determine + whether this option was set on the command line. 
It is sometimes + useful to know whether an option has the default value because the + user set it explicitly (was_set -> True) or not. + You can also call this in a handle_completion method, but it will + currently only report true values for the options mentioned + BEFORE the cursor point in the command line. + """ elem = self.get_option_element(opt_name) if not elem: @@ -239,6 +302,16 @@ def was_set(self, opt_name): except AttributeError: return False + def dest_for_option(self, opt_name): + """This will return the value of the dest variable you defined for opt_name. + Mostly useful for handle_completion where you get passed the long option. + """ + elem = self.get_option_element(opt_name) + if not elem: + return None + value = self.__dict__[elem["dest"]] + return value + def add_option(self, short_option, long_option, help, default, dest = None, required=False, groups = None, value_type=lldb.eArgTypeNone, completion_type=None, @@ -251,14 +324,16 @@ def add_option(self, short_option, long_option, help, default, dest: the name of the property that gives you access to the value for this value. Defaults to the long option if not provided. required: if true, this option must be provided or the command will error out - groups: Which "option groups" does this option belong to + groups: Which "option groups" does this option belong to. This can either be + a simple list (e.g. [1, 3, 4, 5]) or you can specify ranges by sublists: + so [1, [3,5]] is the same as [1, 3, 4, 5]. value_type: one of the lldb.eArgType enum values. Some of the common arg types also have default completers, which will be applied automatically. - completion_type: currently these are values form the lldb.CompletionType enum, I - haven't done custom completions yet. + completion_type: currently these are values form the lldb.CompletionType enum. If + you need custom completions, implement handle_option_argument_completion. enum_values: An array of duples: ["element_name", "element_help"]. If provided, - only one of the enum elements is allowed. The value will be the - element_name for the chosen enum element as a string. + only one of the enum elements is allowed. The value will be the + element_name for the chosen enum element as a string. 
""" if not dest: dest = long_option diff --git a/lldb/include/lldb/Interpreter/ScriptInterpreter.h b/lldb/include/lldb/Interpreter/ScriptInterpreter.h index 901ecf3012d51d..2c2bd6f232e094 100644 --- a/lldb/include/lldb/Interpreter/ScriptInterpreter.h +++ b/lldb/include/lldb/Interpreter/ScriptInterpreter.h @@ -420,6 +420,20 @@ class ScriptInterpreter : public PluginInterface { return std::nullopt; } + virtual StructuredData::DictionarySP + HandleArgumentCompletionForScriptedCommand( + StructuredData::GenericSP impl_obj_sp, std::vector &args, + size_t args_pos, size_t char_in_arg) { + return {}; + } + + virtual StructuredData::DictionarySP + HandleOptionArgumentCompletionForScriptedCommand( + StructuredData::GenericSP impl_obj_sp, llvm::StringRef &long_name, + size_t char_in_arg) { + return {}; + } + virtual bool RunScriptFormatKeyword(const char *impl_function, Process *process, std::string &output, Status &error) { diff --git a/lldb/include/lldb/Symbol/UnwindPlan.h b/lldb/include/lldb/Symbol/UnwindPlan.h index a1d00f2d2c0cd1..e1567c7357d0b5 100644 --- a/lldb/include/lldb/Symbol/UnwindPlan.h +++ b/lldb/include/lldb/Symbol/UnwindPlan.h @@ -370,6 +370,13 @@ class UnwindPlan { bool SetRegisterLocationToSame(uint32_t reg_num, bool must_replace); + /// This method does not make a copy of the \a opcodes memory, it is + /// assumed to have the same lifetime as the Module this UnwindPlan will + /// be registered in. + bool SetRegisterLocationToIsDWARFExpression(uint32_t reg_num, + const uint8_t *opcodes, + uint32_t len, bool can_replace); + bool SetRegisterLocationToIsConstant(uint32_t reg_num, uint64_t constant, bool can_replace); diff --git a/lldb/include/lldb/Utility/CompletionRequest.h b/lldb/include/lldb/Utility/CompletionRequest.h index 1a2b1d639950fc..650158a197dbd9 100644 --- a/lldb/include/lldb/Utility/CompletionRequest.h +++ b/lldb/include/lldb/Utility/CompletionRequest.h @@ -139,6 +139,8 @@ class CompletionRequest { return GetParsedLine()[GetCursorIndex()]; } + size_t GetCursorCharPos() const { return m_cursor_char_position; } + /// Drops the first argument from the argument list. void ShiftArguments() { m_cursor_index--; diff --git a/lldb/source/Commands/CommandObjectCommands.cpp b/lldb/source/Commands/CommandObjectCommands.cpp index e3291640fa9352..845b89a75b7b39 100644 --- a/lldb/source/Commands/CommandObjectCommands.cpp +++ b/lldb/source/Commands/CommandObjectCommands.cpp @@ -1637,6 +1637,129 @@ class CommandObjectScriptingObjectParsed : public CommandObjectParsed { size_t GetNumOptions() { return m_num_options; } + void PrepareOptionsForCompletion(CompletionRequest &request, + OptionElementVector &option_vec, + ExecutionContext *exe_ctx) { + // I'm not sure if we'll get into trouble doing an option parsing start + // and end in this context. If so, then I'll have to directly tell the + // scripter to do this. + OptionParsingStarting(exe_ctx); + auto opt_defs = GetDefinitions(); + + // Iterate through the options we found so far, and push them into + // the scripted side. + for (auto option_elem : option_vec) { + int cur_defs_index = option_elem.opt_defs_index; + // If we don't recognize this option we can't set it. 
+ if (cur_defs_index == OptionArgElement::eUnrecognizedArg || + cur_defs_index == OptionArgElement::eBareDash || + cur_defs_index == OptionArgElement::eBareDoubleDash) + continue; + bool option_has_arg = opt_defs[cur_defs_index].option_has_arg; + llvm::StringRef cur_arg_value; + if (option_has_arg) { + int cur_arg_pos = option_elem.opt_arg_pos; + if (cur_arg_pos != OptionArgElement::eUnrecognizedArg && + cur_arg_pos != OptionArgElement::eBareDash && + cur_arg_pos != OptionArgElement::eBareDoubleDash) { + cur_arg_value = + request.GetParsedLine().GetArgumentAtIndex(cur_arg_pos); + } + } + SetOptionValue(cur_defs_index, cur_arg_value, exe_ctx); + } + OptionParsingFinished(exe_ctx); + } + + void + ProcessCompletionDict(CompletionRequest &request, + StructuredData::DictionarySP &completion_dict_sp) { + // We don't know how to process an empty completion dict, our callers have + // to do that. + assert(completion_dict_sp && "Must have valid completion dict"); + // First handle the case of a single completion: + llvm::StringRef completion; + // If the dictionary has one element "no-completion" then we return here + if (completion_dict_sp->GetValueForKeyAsString("no-completion", + completion)) + return; + + if (completion_dict_sp->GetValueForKeyAsString("completion", + completion)) { + llvm::StringRef mode_str; + CompletionMode mode = CompletionMode::Normal; + if (completion_dict_sp->GetValueForKeyAsString("mode", mode_str)) { + if (mode_str == "complete") + mode = CompletionMode::Normal; + else if (mode_str == "partial") + mode = CompletionMode::Partial; + else { + // FIXME - how do I report errors here? + return; + } + } + request.AddCompletion(completion, "", mode); + return; + } + // The completions are required, the descriptions are not: + StructuredData::Array *completions; + StructuredData::Array *descriptions; + if (completion_dict_sp->GetValueForKeyAsArray("values", completions)) { + completion_dict_sp->GetValueForKeyAsArray("descriptions", descriptions); + size_t num_completions = completions->GetSize(); + for (size_t idx = 0; idx < num_completions; idx++) { + auto val = completions->GetItemAtIndexAsString(idx); + if (!val) + // FIXME: How do I report this error? + return; + + if (descriptions) { + auto desc = descriptions->GetItemAtIndexAsString(idx); + request.AddCompletion(*val, desc ? 
*desc : ""); + } else + request.AddCompletion(*val); + } + } + } + + void + HandleOptionArgumentCompletion(lldb_private::CompletionRequest &request, + OptionElementVector &option_vec, + int opt_element_index, + CommandInterpreter &interpreter) override { + ScriptInterpreter *scripter = + interpreter.GetDebugger().GetScriptInterpreter(); + + if (!scripter) + return; + + ExecutionContext exe_ctx = interpreter.GetExecutionContext(); + PrepareOptionsForCompletion(request, option_vec, &exe_ctx); + + auto defs = GetDefinitions(); + + size_t defs_index = option_vec[opt_element_index].opt_defs_index; + llvm::StringRef option_name = defs[defs_index].long_option; + bool is_enum = defs[defs_index].enum_values.size() != 0; + if (option_name.empty()) + return; + // If this is an enum, we don't call the custom completer, just let the + // regular option completer handle that: + StructuredData::DictionarySP completion_dict_sp; + if (!is_enum) + completion_dict_sp = + scripter->HandleOptionArgumentCompletionForScriptedCommand( + m_cmd_obj_sp, option_name, request.GetCursorCharPos()); + + if (!completion_dict_sp) { + Options::HandleOptionArgumentCompletion(request, option_vec, + opt_element_index, interpreter); + return; + } + + ProcessCompletionDict(request, completion_dict_sp); + } + private: struct EnumValueStorage { EnumValueStorage() { @@ -1878,6 +2001,74 @@ class CommandObjectScriptingObjectParsed : public CommandObjectParsed { Status GetArgsError() { return m_args_error.Clone(); } bool WantsCompletion() override { return true; } +private: + void PrepareOptionsForCompletion(CompletionRequest &request, + OptionElementVector &option_vec) { + // First, we have to tell the Scripted side to set the values in its + // option store, then we call into the handle_completion passing in + // an array of the args, the arg index and the cursor position in the arg. + // We want the script side to have a chance to clear its state, so tell + // it argument parsing has started: + Options *options = GetOptions(); + // If there are not options, this will be nullptr, and in that case we + // can just skip setting the options on the scripted side: + if (options) + m_options.PrepareOptionsForCompletion(request, option_vec, &m_exe_ctx); + } + +public: + void HandleArgumentCompletion(CompletionRequest &request, + OptionElementVector &option_vec) override { + ScriptInterpreter *scripter = GetDebugger().GetScriptInterpreter(); + + if (!scripter) + return; + + // Set up the options values on the scripted side: + PrepareOptionsForCompletion(request, option_vec); + + // Now we have to make up the argument list. 
+ // The ParseForCompletion only identifies tokens in the m_parsed_line + // it doesn't remove the options leaving only the args as it does for + // the regular Parse, so we have to filter out the option ones using the + // option_element_vector: + + Options *options = GetOptions(); + auto defs = options->GetDefinitions(); + + std::unordered_set option_slots; + for (const auto &elem : option_vec) { + if (elem.opt_defs_index == -1) + continue; + option_slots.insert(elem.opt_pos); + if (defs[elem.opt_defs_index].option_has_arg) + option_slots.insert(elem.opt_arg_pos); + } + + std::vector args_vec; + Args &args = request.GetParsedLine(); + size_t num_args = args.GetArgumentCount(); + size_t cursor_idx = request.GetCursorIndex(); + size_t args_elem_pos = cursor_idx; + + for (size_t idx = 0; idx < num_args; idx++) { + if (option_slots.count(idx) == 0) + args_vec.push_back(args[idx].ref()); + else if (idx < cursor_idx) + args_elem_pos--; + } + StructuredData::DictionarySP completion_dict_sp = + scripter->HandleArgumentCompletionForScriptedCommand( + m_cmd_obj_sp, args_vec, args_elem_pos, request.GetCursorCharPos()); + + if (!completion_dict_sp) { + CommandObject::HandleArgumentCompletion(request, option_vec); + return; + } + + m_options.ProcessCompletionDict(request, completion_dict_sp); + } + bool IsRemovable() const override { return true; } ScriptedCommandSynchronicity GetSynchronicity() { return m_synchro; } diff --git a/lldb/source/Interpreter/Options.cpp b/lldb/source/Interpreter/Options.cpp index b8a3f68a49b1cf..3888a5812628cd 100644 --- a/lldb/source/Interpreter/Options.cpp +++ b/lldb/source/Interpreter/Options.cpp @@ -661,7 +661,9 @@ bool Options::HandleOptionCompletion(CompletionRequest &request, } else if (opt_arg_pos == request.GetCursorIndex()) { // Okay the cursor is on the completion of an argument. See if it has a - // completion, otherwise return no matches. + // completion, otherwise return no matches. Note, opt_defs_index == -1 + // means we're after an option, but that option doesn't exist. We'll + // end up treating that as an argument. Not sure we can do much better. 
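To make the hand-off described in that comment concrete, here is a rough, self-contained illustration (a hypothetical helper, not lldb API) of the filtering this code performs before calling the scripted handle_argument_completion: tokens owned by options (and their values) are dropped, and the cursor index is shifted down by the number of option tokens that precede it.

```python
def strip_option_tokens(tokens, option_slots, cursor_idx):
    """Drop tokens owned by options (and their values) and shift the
    cursor index so it still points at the same argument afterwards."""
    args = []
    args_pos = cursor_idx
    for idx, tok in enumerate(tokens):
        if idx in option_slots:
            if idx < cursor_idx:
                args_pos -= 1
            continue
        args.append(tok)
    return args, args_pos


# Roughly what happens for "two-args -p something arg" with the cursor on
# "arg": "-p" and its value occupy slots 0 and 1, so the script only sees
# ["arg"], with the cursor on element 0.
print(strip_option_tokens(["-p", "something", "arg"], {0, 1}, 2))
# (['arg'], 0)
```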
if (opt_defs_index != -1) { HandleOptionArgumentCompletion(request, opt_element_vector, i, interpreter); @@ -688,7 +690,6 @@ void Options::HandleOptionArgumentCompletion( int opt_defs_index = opt_element_vector[opt_element_index].opt_defs_index; // See if this is an enumeration type option, and if so complete it here: - const auto &enum_values = opt_defs[opt_defs_index].enum_values; if (!enum_values.empty()) for (const auto &enum_value : enum_values) diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h index 81ee9ea0a2fa10..518a478af5f6a8 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h @@ -200,6 +200,15 @@ class SWIGBridge { LLDBSwigPythonGetRepeatCommandForScriptedCommand(PyObject *implementor, std::string &command); + static StructuredData::DictionarySP + LLDBSwigPythonHandleArgumentCompletionForScriptedCommand( + PyObject *implementor, std::vector &args_impl, + size_t args_pos, size_t pos_in_arg); + + static StructuredData::DictionarySP + LLDBSwigPythonHandleOptionArgumentCompletionForScriptedCommand( + PyObject *implementor, llvm::StringRef &long_option, size_t pos_in_arg); + static bool LLDBSwigPythonCallModuleInit(const char *python_module_name, const char *session_dictionary_name, lldb::DebuggerSP debugger); diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp index 155efc06eaf41a..db1a10e73a66ab 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp @@ -2720,6 +2720,46 @@ ScriptInterpreterPythonImpl::GetRepeatCommandForScriptedCommand( return ret_val; } +StructuredData::DictionarySP +ScriptInterpreterPythonImpl::HandleArgumentCompletionForScriptedCommand( + StructuredData::GenericSP impl_obj_sp, std::vector &args, + size_t args_pos, size_t char_in_arg) { + StructuredData::DictionarySP completion_dict_sp; + if (!impl_obj_sp || !impl_obj_sp->IsValid()) + return completion_dict_sp; + + { + Locker py_lock(this, Locker::AcquireLock | Locker::NoSTDIN, + Locker::FreeLock); + + completion_dict_sp = + SWIGBridge::LLDBSwigPythonHandleArgumentCompletionForScriptedCommand( + static_cast(impl_obj_sp->GetValue()), args, args_pos, + char_in_arg); + } + return completion_dict_sp; +} + +StructuredData::DictionarySP +ScriptInterpreterPythonImpl::HandleOptionArgumentCompletionForScriptedCommand( + StructuredData::GenericSP impl_obj_sp, llvm::StringRef &long_option, + size_t char_in_arg) { + StructuredData::DictionarySP completion_dict_sp; + if (!impl_obj_sp || !impl_obj_sp->IsValid()) + return completion_dict_sp; + + { + Locker py_lock(this, Locker::AcquireLock | Locker::NoSTDIN, + Locker::FreeLock); + + completion_dict_sp = SWIGBridge:: + LLDBSwigPythonHandleOptionArgumentCompletionForScriptedCommand( + static_cast(impl_obj_sp->GetValue()), long_option, + char_in_arg); + } + return completion_dict_sp; +} + /// In Python, a special attribute __doc__ contains the docstring for an object /// (function, method, class, ...) if any is defined Otherwise, the attribute's /// value is None. 
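From the script's point of view, the dictionaries these bridge calls hand back are the ones ProcessCompletionDict (earlier in this patch) knows how to consume. A sketch of the recognized shapes, using a hypothetical handle_argument_completion body — the key names ("completion", "mode", "values", "descriptions", "no-completion") come from the C++ above; everything else is illustrative:

```python
def handle_argument_completion(self, args, arg_pos, cursor_pos):
    typed = args[arg_pos][0:cursor_pos]

    # A single completion; "mode" may be "complete" (the default) or
    # "partial" to tell lldb the user still has more to type.
    if typed.startswith("uni"):
        return {"completion": "unique_thing", "mode": "partial"}

    # Several completions, with an optional parallel "descriptions" list.
    if typed:
        return {
            "values": [typed + "_one", typed + "_two"],
            "descriptions": ["first choice", "second choice"],
        }

    # Explicitly report that there is nothing to complete (as opposed to
    # returning nothing at all, which falls back to the default completer).
    return {"no-completion": "nothing to suggest"}
```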
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h index d15e2fd76f683b..2dc784777151bb 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h @@ -166,6 +166,14 @@ class ScriptInterpreterPythonImpl : public ScriptInterpreterPython { GetRepeatCommandForScriptedCommand(StructuredData::GenericSP impl_obj_sp, Args &args) override; + StructuredData::DictionarySP HandleArgumentCompletionForScriptedCommand( + StructuredData::GenericSP impl_obj_sp, std::vector &args, + size_t args_pos, size_t char_in_arg) override; + + StructuredData::DictionarySP HandleOptionArgumentCompletionForScriptedCommand( + StructuredData::GenericSP impl_obj_sp, llvm::StringRef &long_options, + size_t char_in_arg) override; + Status GenerateFunction(const char *signature, const StringList &input, bool is_callback) override; diff --git a/lldb/test/API/commands/command/script/add/TestAddParsedCommand.py b/lldb/test/API/commands/command/script/add/TestAddParsedCommand.py index c7680e9bb7f418..6fac1eba919bc9 100644 --- a/lldb/test/API/commands/command/script/add/TestAddParsedCommand.py +++ b/lldb/test/API/commands/command/script/add/TestAddParsedCommand.py @@ -68,6 +68,57 @@ def run_one_repeat(self, commands, expected_num_errors): return results + def handle_completion( + self, + cmd_str, + exp_num_completions, + exp_matches, + exp_descriptions, + match_description, + ): + matches = lldb.SBStringList() + descriptions = lldb.SBStringList() + + interp = self.dbg.GetCommandInterpreter() + num_completions = interp.HandleCompletionWithDescriptions( + cmd_str, len(cmd_str), 0, 1000, matches, descriptions + ) + self.assertEqual( + num_completions, exp_num_completions, "Number of completions is right." 
+ ) + num_matches = matches.GetSize() + self.assertEqual( + num_matches, + exp_matches.GetSize(), + "matches and expected matches of different lengths", + ) + num_descriptions = descriptions.GetSize() + if match_description: + self.assertEqual( + num_descriptions, + exp_descriptions.GetSize(), + "descriptions and expected of different lengths", + ) + + self.assertEqual( + matches.GetSize(), + num_completions + 1, + "The first element is the complete additional text", + ) + + for idx in range(0, num_matches): + match = matches.GetStringAtIndex(idx) + exp_match = exp_matches.GetStringAtIndex(idx) + self.assertEqual( + match, exp_match, f"{match} did not match expectation: {exp_match}" + ) + if match_description: + desc = descriptions.GetStringAtIndex(idx) + exp_desc = exp_descriptions.GetStringAtIndex(idx) + self.assertEqual( + desc, exp_desc, f"{desc} didn't match expectation: {exp_desc}" + ) + def pycmd_tests(self): source_dir = self.getSourceDir() test_file_path = os.path.join(source_dir, "test_commands.py") @@ -176,24 +227,10 @@ def cleanup(): descriptions = lldb.SBStringList() # First try an enum completion: - num_completions = interp.HandleCompletionWithDescriptions( - "no-args -e f", 12, 0, 1000, matches, descriptions - ) - self.assertEqual(num_completions, 1, "Only one completion for foo") - self.assertEqual( - matches.GetSize(), 2, "The first element is the complete additional text" - ) - self.assertEqual( - matches.GetStringAtIndex(0), "oo ", "And we got the right extra characters" - ) - self.assertEqual( - matches.GetStringAtIndex(1), "foo", "And we got the right match" - ) - self.assertEqual( - descriptions.GetSize(), 2, "descriptions matche the return length" - ) - # FIXME: we don't return descriptions for enum elements - # self.assertEqual(descriptions.GetStringAtIndex(1), "does foo things", "And we got the right description") + # Note - this is an enum so all the values are returned: + matches.AppendList(["oo ", "foo"], 2) + + self.handle_completion("no-args -e f", 1, matches, descriptions, False) # Now try an internal completer, the on disk file one is handy: partial_name = os.path.join(source_dir, "test_") @@ -201,24 +238,9 @@ def cleanup(): matches.Clear() descriptions.Clear() - num_completions = interp.HandleCompletionWithDescriptions( - cmd_str, len(cmd_str) - 1, 0, 1000, matches, descriptions - ) - self.assertEqual(num_completions, 1, "Only one completion for source file") - self.assertEqual(matches.GetSize(), 2, "The first element is the complete line") - self.assertEqual( - matches.GetStringAtIndex(0), - "commands.py' ", - "And we got the right extra characters", - ) - self.assertEqual( - matches.GetStringAtIndex(1), test_file_path, "And we got the right match" - ) - self.assertEqual( - descriptions.GetSize(), 2, "descriptions match the return length" - ) - # FIXME: we don't return descriptions for enum elements - # self.assertEqual(descriptions.GetStringAtIndex(1), "does foo things", "And we got the right description") + matches.AppendList(["commands.py' ", test_file_path], 2) + # We don't have descriptions for the file path completer: + self.handle_completion(cmd_str, 1, matches, descriptions, False) # Try a command with arguments. # FIXME: It should be enough to define an argument and it's type to get the completer @@ -231,6 +253,44 @@ def cleanup(): substrs=["0: First Argument", "1: Second Argument"], ) + # Now test custom completions - two-args has both option and arg completers. 
In both + # completers we return different values if the -p option is set, so we can test that too: + matches.Clear() + descriptions.Clear() + cmd_str = "two-args -p something -c other_" + matches.AppendString("something ") + matches.AppendString("other_something") + # This is a full match so no descriptions: + self.handle_completion(cmd_str, 1, matches, descriptions, False) + + matches.Clear() + descriptions.Clear() + cmd_str = "two-args -c other_" + matches.AppendList(["", "other_nice", "other_not_nice", "other_mediocre"], 4) + # The option doesn't return descriptions either: + self.handle_completion(cmd_str, 3, matches, descriptions, False) + + # Now try the argument - it says "no completions" if the proc_name was set: + matches.Clear() + descriptions.Clear() + cmd_str = "two-args -p something arg" + matches.AppendString("") + self.handle_completion(cmd_str, 0, matches, descriptions, False) + + cmd_str = "two-args arg_" + matches.Clear() + descriptions.Clear() + matches.AppendList(["", "arg_cool", "arg_yuck"], 3) + descriptions.AppendList(["", "good idea", "bad idea"], 3) + self.handle_completion(cmd_str, 2, matches, descriptions, True) + + # This one gets a single unique match: + cmd_str = "two-args correct_" + matches.Clear() + descriptions.Clear() + matches.AppendList(["answer ", "correct_answer"], 2) + self.handle_completion(cmd_str, 1, matches, descriptions, False) + # Now make sure get_repeat_command works properly: # no-args turns off auto-repeat diff --git a/lldb/test/API/commands/command/script/add/test_commands.py b/lldb/test/API/commands/command/script/add/test_commands.py index fcde6cd3ef6dc6..b15ea935c05867 100644 --- a/lldb/test/API/commands/command/script/add/test_commands.py +++ b/lldb/test/API/commands/command/script/add/test_commands.py @@ -18,7 +18,7 @@ def __call__(self, debugger, args_array, exe_ctx, result): for long_option, elem in opt_def.items(): dest = elem["dest"] result.AppendMessage( - f"{long_option} (set: {elem['_value_set']}): {object.__getattribute__(self.ov_parser, dest)}\n" + f"{long_option} (set: {elem['_value_set']}): {object.__getattribute__(self.get_parser(), dest)}\n" ) else: result.AppendMessage("No options\n") @@ -31,7 +31,6 @@ def __call__(self, debugger, args_array, exe_ctx, result): f"{idx}: {args_array.GetItemAtIndex(idx).GetStringValue(10000)}\n" ) - # Use these to make sure that get_repeat_command sends the right # command. 
no_args_repeat = None @@ -49,7 +48,8 @@ def register_lldb_command(cls, debugger, module_name): ParsedCommand.do_register_cmd(cls, debugger, module_name) def setup_command_definition(self): - self.ov_parser.add_option( + ov_parser = self.get_parser() + ov_parser.add_option( "b", "bool-arg", "a boolean arg, defaults to True", @@ -59,7 +59,7 @@ def setup_command_definition(self): default=True, ) - self.ov_parser.add_option( + ov_parser.add_option( "s", "shlib-name", "A shared library name.", @@ -69,7 +69,7 @@ def setup_command_definition(self): default=None, ) - self.ov_parser.add_option( + ov_parser.add_option( "d", "disk-file-name", "An on disk filename", @@ -78,7 +78,7 @@ def setup_command_definition(self): default=None, ) - self.ov_parser.add_option( + ov_parser.add_option( "l", "line-num", "A line number", @@ -88,7 +88,7 @@ def setup_command_definition(self): default=0, ) - self.ov_parser.add_option( + ov_parser.add_option( "e", "enum-option", "An enum, doesn't actually do anything", @@ -126,8 +126,9 @@ def register_lldb_command(cls, debugger, module_name): ParsedCommand.do_register_cmd(cls, debugger, module_name) def setup_command_definition(self): - self.ov_parser.add_argument_set( - [self.ov_parser.make_argument_element(lldb.eArgTypeSourceFile, "plain")] + ov_parser = self.get_parser() + ov_parser.add_argument_set( + [ov_parser.make_argument_element(lldb.eArgTypeSourceFile, "plain")] ) def get_repeat_command(self, command): @@ -154,7 +155,8 @@ def register_lldb_command(cls, debugger, module_name): ParsedCommand.do_register_cmd(cls, debugger, module_name) def setup_command_definition(self): - self.ov_parser.add_option( + ov_parser = self.get_parser() + ov_parser.add_option( "l", "language", "language defaults to None", @@ -164,7 +166,7 @@ def setup_command_definition(self): default=None, ) - self.ov_parser.add_option( + ov_parser.add_option( "c", "log-channel", "log channel - defaults to lldb", @@ -174,7 +176,7 @@ def setup_command_definition(self): default="lldb", ) - self.ov_parser.add_option( + ov_parser.add_option( "p", "process-name", "A process name, defaults to None", @@ -183,25 +185,23 @@ def setup_command_definition(self): default=None, ) - self.ov_parser.add_argument_set( + ov_parser.add_argument_set( [ - self.ov_parser.make_argument_element( + ov_parser.make_argument_element( lldb.eArgTypeClassName, "plain", [1, 2] ), - self.ov_parser.make_argument_element( + ov_parser.make_argument_element( lldb.eArgTypeOffset, "optional", [1, 2] ), ] ) - self.ov_parser.add_argument_set( + ov_parser.add_argument_set( [ - self.ov_parser.make_argument_element( + ov_parser.make_argument_element( lldb.eArgTypePythonClass, "plain", [3, 4] ), - self.ov_parser.make_argument_element( - lldb.eArgTypePid, "optional", [3, 4] - ), + ov_parser.make_argument_element(lldb.eArgTypePid, "optional", [3, 4]), ] ) @@ -210,6 +210,35 @@ def get_repeat_command(self, command): two_arg_repeat = command return command + " THIRD_ARG" + def handle_option_argument_completion(self, long_option, cursor_pos): + ov_parser = self.get_parser() + value = ov_parser.dest_for_option(long_option)[0 : cursor_pos + 1] + proc_value = ov_parser.proc_name + if proc_value != None: + new_str = value + proc_value + ret_arr = {"completion": new_str, "mode": "partial"} + return ret_arr + + ret_arr = {"values": [value + "nice", value + "not_nice", value + "mediocre"]} + return ret_arr + + def handle_argument_completion(self, args, arg_pos, cursor_pos): + ov_parser = self.get_parser() + orig_arg = args[arg_pos][0:cursor_pos] + if orig_arg == 
"correct_": + ret_arr = {"completion": "correct_answer"} + return ret_arr + + if ov_parser.was_set("process-name"): + # No completions if proc_name was set. + return True + + ret_arr = { + "values": [orig_arg + "cool", orig_arg + "yuck"], + "descriptions": ["good idea", "bad idea"], + } + return ret_arr + def get_short_help(self): return "This is my short help string" diff --git a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp index c67a2b4bf46e64..3faeb587c3a91b 100644 --- a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp +++ b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp @@ -211,6 +211,19 @@ LLDBSwigPythonGetRepeatCommandForScriptedCommand(PyObject *implementor, return std::nullopt; } +StructuredData::DictionarySP +LLDBSwigPythonHandleArgumentCompletionForScriptedCommand( + PyObject *implementor, std::vector &args, size_t args_pos, + size_t pos_in_arg) { + return {}; +} + +StructuredData::DictionarySP +LLDBSwigPythonHandleOptionArgumentCompletionForScriptedCommand( + PyObject *implementor, llvm::StringRef &long_options, size_t char_in_arg) { + return {}; +} + bool lldb_private::python::SWIGBridge::LLDBSwigPythonCallModuleInit( const char *python_module_name, const char *session_dictionary_name, lldb::DebuggerSP debugger) { diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 3a566bbac36233..8b0b05c0ea424e 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -127,69 +127,6 @@ Example: 64-bit PTX for CUDA Driver API: ``nvptx64-nvidia-cuda`` NVPTX Intrinsics ================ -Address Space Conversion ------------------------- - -'``llvm.nvvm.ptr.*.to.gen``' Intrinsics -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Syntax: -""""""" - -These are overloaded intrinsics. You can use these on any pointer types. - -.. code-block:: llvm - - declare ptr @llvm.nvvm.ptr.global.to.gen.p0.p1(ptr addrspace(1)) - declare ptr @llvm.nvvm.ptr.shared.to.gen.p0.p3(ptr addrspace(3)) - declare ptr @llvm.nvvm.ptr.constant.to.gen.p0.p4(ptr addrspace(4)) - declare ptr @llvm.nvvm.ptr.local.to.gen.p0.p5(ptr addrspace(5)) - -Overview: -""""""""" - -The '``llvm.nvvm.ptr.*.to.gen``' intrinsics convert a pointer in a non-generic -address space to a generic address space pointer. - -Semantics: -"""""""""" - -These intrinsics modify the pointer value to be a valid generic address space -pointer. - - -'``llvm.nvvm.ptr.gen.to.*``' Intrinsics -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Syntax: -""""""" - -These are overloaded intrinsics. You can use these on any pointer types. - -.. code-block:: llvm - - declare ptr addrspace(1) @llvm.nvvm.ptr.gen.to.global.p1.p0(ptr) - declare ptr addrspace(3) @llvm.nvvm.ptr.gen.to.shared.p3.p0(ptr) - declare ptr addrspace(4) @llvm.nvvm.ptr.gen.to.constant.p4.p0(ptr) - declare ptr addrspace(5) @llvm.nvvm.ptr.gen.to.local.p5.p0(ptr) - -Overview: -""""""""" - -The '``llvm.nvvm.ptr.gen.to.*``' intrinsics convert a pointer in the generic -address space to a pointer in the target address space. Note that these -intrinsics are only useful if the address space of the target address space of -the pointer is known. It is not legal to use address space conversion -intrinsics to convert a pointer from one non-generic address space to another -non-generic address space. - -Semantics: -"""""""""" - -These intrinsics modify the pointer value to be a valid pointer in the target -non-generic address space. 
- - Reading PTX Special Registers ----------------------------- diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 27d6bc158b3c37..c85ea28ad9f8c7 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -69,6 +69,18 @@ Changes to the LLVM IR * ``llvm.nvvm.rotate.right.b64`` * ``llvm.nvvm.rotate.b64`` +* Remove the following intrinsics which can be replaced with an + ``addrspacecast``: + + * ``llvm.nvvm.ptr.gen.to.global`` + * ``llvm.nvvm.ptr.gen.to.shared`` + * ``llvm.nvvm.ptr.gen.to.constant`` + * ``llvm.nvvm.ptr.gen.to.local`` + * ``llvm.nvvm.ptr.global.to.gen`` + * ``llvm.nvvm.ptr.shared.to.gen`` + * ``llvm.nvvm.ptr.constant.to.gen`` + * ``llvm.nvvm.ptr.local.to.gen`` + Changes to LLVM infrastructure ------------------------------ diff --git a/llvm/docs/TableGen/ProgRef.rst b/llvm/docs/TableGen/ProgRef.rst index dcea3b721dae27..5cf48d6ed29786 100644 --- a/llvm/docs/TableGen/ProgRef.rst +++ b/llvm/docs/TableGen/ProgRef.rst @@ -223,12 +223,12 @@ TableGen provides "bang operators" that have a wide variety of uses: : !div !empty !eq !exists !filter : !find !foldl !foreach !ge !getdagarg : !getdagname !getdagop !gt !head !if - : !interleave !isa !le !listconcat !listremove - : !listsplat !logtwo !lt !mul !ne - : !not !or !range !repr !setdagarg - : !setdagname !setdagop !shl !size !sra - : !srl !strconcat !sub !subst !substr - : !tail !tolower !toupper !xor + : !interleave !isa !le !listconcat !listflatten + : !listremove !listsplat !logtwo !lt !mul + : !ne !not !or !range !repr + : !setdagarg !setdagname !setdagop !shl !size + : !sra !srl !strconcat !sub !subst + : !substr !tail !tolower !toupper !xor The ``!cond`` operator has a slightly different syntax compared to other bang operators, so it is defined separately: @@ -1832,6 +1832,12 @@ and non-0 as true. This operator concatenates the list arguments *list1*, *list2*, etc., and produces the resulting list. The lists must have the same element type. +``!listflatten(``\ *list*\ ``)`` + This operator flattens a list of lists *list* and produces a list with all + elements of the constituent lists concatenated. If *list* is of type + ``list>`` the resulting list is of type ``list``. If *list*'s + element type is not a list, the result is *list* itself. + ``!listremove(``\ *list1*\ ``,`` *list2*\ ``)`` This operator returns a copy of *list1* removing all elements that also occur in *list2*. The lists must have the same element type. diff --git a/llvm/include/llvm/Analysis/PtrUseVisitor.h b/llvm/include/llvm/Analysis/PtrUseVisitor.h index 237d328721609b..bbe2741f44fc3d 100644 --- a/llvm/include/llvm/Analysis/PtrUseVisitor.h +++ b/llvm/include/llvm/Analysis/PtrUseVisitor.h @@ -52,57 +52,54 @@ class PtrUseVisitorBase { /// analysis and whether the visit completed or aborted early. class PtrInfo { public: - PtrInfo() : AbortedInfo(nullptr, false), EscapedInfo(nullptr, false) {} - /// Reset the pointer info, clearing all state. void reset() { - AbortedInfo.setPointer(nullptr); - AbortedInfo.setInt(false); - EscapedInfo.setPointer(nullptr); - EscapedInfo.setInt(false); + AbortedInfo = nullptr; + EscapedInfo = nullptr; } /// Did we abort the visit early? - bool isAborted() const { return AbortedInfo.getInt(); } + bool isAborted() const { return AbortedInfo != nullptr; } /// Is the pointer escaped at some point? - bool isEscaped() const { return EscapedInfo.getInt(); } + bool isEscaped() const { return EscapedInfo != nullptr; } /// Get the instruction causing the visit to abort. 
/// \returns a pointer to the instruction causing the abort if one is /// available; otherwise returns null. - Instruction *getAbortingInst() const { return AbortedInfo.getPointer(); } + Instruction *getAbortingInst() const { return AbortedInfo; } /// Get the instruction causing the pointer to escape. /// \returns a pointer to the instruction which escapes the pointer if one /// is available; otherwise returns null. - Instruction *getEscapingInst() const { return EscapedInfo.getPointer(); } + Instruction *getEscapingInst() const { return EscapedInfo; } /// Mark the visit as aborted. Intended for use in a void return. /// \param I The instruction which caused the visit to abort, if available. - void setAborted(Instruction *I = nullptr) { - AbortedInfo.setInt(true); - AbortedInfo.setPointer(I); + void setAborted(Instruction *I) { + assert(I && "Expected a valid pointer in setAborted"); + AbortedInfo = I; } /// Mark the pointer as escaped. Intended for use in a void return. /// \param I The instruction which escapes the pointer, if available. - void setEscaped(Instruction *I = nullptr) { - EscapedInfo.setInt(true); - EscapedInfo.setPointer(I); + void setEscaped(Instruction *I) { + assert(I && "Expected a valid pointer in setEscaped"); + EscapedInfo = I; } /// Mark the pointer as escaped, and the visit as aborted. Intended /// for use in a void return. /// \param I The instruction which both escapes the pointer and aborts the /// visit, if available. - void setEscapedAndAborted(Instruction *I = nullptr) { + void setEscapedAndAborted(Instruction *I) { setEscaped(I); setAborted(I); } private: - PointerIntPair AbortedInfo, EscapedInfo; + Instruction *AbortedInfo = nullptr; + Instruction *EscapedInfo = nullptr; }; protected: diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h index e1376b7b25599e..e8c0ec42db477f 100644 --- a/llvm/include/llvm/CodeGen/MachineFunction.h +++ b/llvm/include/llvm/CodeGen/MachineFunction.h @@ -897,13 +897,14 @@ class LLVM_ABI MachineFunction { /// for debugger use. /// \returns true if no problems were found. bool verify(Pass *p = nullptr, const char *Banner = nullptr, - bool AbortOnError = true) const; + raw_ostream *OS = nullptr, bool AbortOnError = true) const; /// Run the current MachineFunction through the machine code verifier, useful /// for debugger use. /// \returns true if no problems were found. bool verify(LiveIntervals *LiveInts, SlotIndexes *Indexes, - const char *Banner = nullptr, bool AbortOnError = true) const; + const char *Banner = nullptr, raw_ostream *OS = nullptr, + bool AbortOnError = true) const; // Provide accessors for the MachineBasicBlock list... using iterator = BasicBlockListType::iterator; diff --git a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h index 7a131645893921..045ec7d3653119 100644 --- a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h +++ b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h @@ -62,6 +62,10 @@ Libcall getLDEXP(EVT RetVT); /// UNKNOWN_LIBCALL if there is none. Libcall getFREXP(EVT RetVT); +/// getFSINCOS - Return the FSINCOS_* value for the given types, or +/// UNKNOWN_LIBCALL if there is none. +Libcall getFSINCOS(EVT RetVT); + /// Return the SYNC_FETCH_AND_* value for the given opcode and type, or /// UNKNOWN_LIBCALL if there is none. 
Libcall getSYNC(unsigned Opc, MVT VT); diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h b/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h index 2f475ed884a7e2..24cf982fc3ab0f 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h @@ -95,6 +95,19 @@ enum EdgeKind_x86_64 : Edge::Kind { /// Delta32, + /// A 16-bit delta. + /// + /// Delta from the fixup to the target. + /// + /// Fixup expression: + /// Fixup <- Target - Fixup + Addend : int16 + /// + /// Errors: + /// - The result of the fixup expression must fit into an int16, otherwise + /// an out-of-range error will be returned. + /// + Delta16, + /// An 8-bit delta. /// /// Delta from the fixup to the target. @@ -486,6 +499,15 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E, break; } + case Delta16: { + int64_t Value = E.getTarget().getAddress() - FixupAddress + E.getAddend(); + if (LLVM_LIKELY(isInt<16>(Value))) + *(little16_t *)FixupPtr = Value; + else + return makeTargetOutOfRangeError(G, B, E); + break; + } + case Delta8: { int64_t Value = E.getTarget().getAddress() - FixupAddress + E.getAddend(); if (LLVM_LIKELY(isInt<8>(Value))) diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index aa5294f5f9c909..7b8ffe417fccdb 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -30,10 +30,18 @@ // * llvm.nvvm.max.ui --> select(x ule y, x, y) // * llvm.nvvm.max.ull --> ibid. // * llvm.nvvm.h2f --> llvm.convert.to.fp16.f32 -// * llvm.nvvm.bitcast.f2i --> bitcast -// * llvm.nvvm.bitcast.i2f --> ibid. -// * llvm.nvvm.bitcast.d2ll --> ibid. -// * llvm.nvvm.bitcast.ll2d --> ibid. +// * llvm.nvvm.bitcast.f2i --> bitcast +// * llvm.nvvm.bitcast.i2f --> ibid. +// * llvm.nvvm.bitcast.d2ll --> ibid. +// * llvm.nvvm.bitcast.ll2d --> ibid. +// * llvm.nvvm.ptr.gen.to.global --> addrspacecast +// * llvm.nvvm.ptr.gen.to.shared --> ibid. +// * llvm.nvvm.ptr.gen.to.constant --> ibid. +// * llvm.nvvm.ptr.gen.to.local --> ibid. +// * llvm.nvvm.ptr.global.to.gen --> ibid. +// * llvm.nvvm.ptr.shared.to.gen --> ibid. +// * llvm.nvvm.ptr.constant.to.gen --> ibid. +// * llvm.nvvm.ptr.local.to.gen --> ibid. def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr @@ -1602,40 +1610,6 @@ def int_nvvm_ldg_global_p : Intrinsic<[llvm_anyptr_ty], [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture>], "llvm.nvvm.ldg.global.p">; -// Use for generic pointers -// - These intrinsics are used to convert address spaces. -// - The input pointer and output pointer must have the same type, except for -// the address-space. (This restriction is not enforced here as there is -// currently no way to describe it). -// - This complements the llvm bitcast, which can be used to cast one type -// of pointer to another type of pointer, while the address space remains -// the same. 
-def int_nvvm_ptr_local_to_gen: DefaultAttrsIntrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable], - "llvm.nvvm.ptr.local.to.gen">; -def int_nvvm_ptr_shared_to_gen: DefaultAttrsIntrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable], - "llvm.nvvm.ptr.shared.to.gen">; -def int_nvvm_ptr_global_to_gen: DefaultAttrsIntrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable], - "llvm.nvvm.ptr.global.to.gen">; -def int_nvvm_ptr_constant_to_gen: DefaultAttrsIntrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable], - "llvm.nvvm.ptr.constant.to.gen">; - -def int_nvvm_ptr_gen_to_global: DefaultAttrsIntrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable], - "llvm.nvvm.ptr.gen.to.global">; -def int_nvvm_ptr_gen_to_shared: DefaultAttrsIntrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable], - "llvm.nvvm.ptr.gen.to.shared">; -def int_nvvm_ptr_gen_to_local: DefaultAttrsIntrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable], - "llvm.nvvm.ptr.gen.to.local">; -def int_nvvm_ptr_gen_to_constant: DefaultAttrsIntrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable], - "llvm.nvvm.ptr.gen.to.constant">; - // Used in nvvm internally to help address space opt and ptx code generation // This is for params that are passed to kernel functions by pointer by-val. def int_nvvm_ptr_gen_to_param: Intrinsic<[llvm_anyptr_ty], diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index d5e239e70da613..d4c907ce8327dd 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -1935,6 +1935,22 @@ class Instruction : public sandboxir::User { /// \Returns this Instruction's opcode. Note that SandboxIR has its own opcode /// state to allow for new SandboxIR-specific instructions. Opcode getOpcode() const { return Opc; } + + // TODO: Missing function getOpcodeName(). + + bool isTerminator() const { + return cast(Val)->isTerminator(); + } + bool isUnaryOp() const { return cast(Val)->isUnaryOp(); } + bool isBinaryOp() const { return cast(Val)->isBinaryOp(); } + bool isIntDivRem() const { + return cast(Val)->isIntDivRem(); + } + bool isShift() const { return cast(Val)->isShift(); } + bool isCast() const { return cast(Val)->isCast(); } + + // TODO: More missing functions + /// Detach this from its parent BasicBlock without deleting it. void removeFromParent(); /// Detach this Value from its parent and delete it. 
diff --git a/llvm/include/llvm/TableGen/Record.h b/llvm/include/llvm/TableGen/Record.h index 5348c1177f63ed..4cd73c3f675527 100644 --- a/llvm/include/llvm/TableGen/Record.h +++ b/llvm/include/llvm/TableGen/Record.h @@ -847,7 +847,8 @@ class UnOpInit : public OpInit, public FoldingSetNode { EMPTY, GETDAGOP, LOG2, - REPR + REPR, + LISTFLATTEN, }; private: diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index 8d6d800d761474..3a00b8ec4771dd 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -634,7 +634,7 @@ MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF, MF.getSubtarget().mirFileLoaded(MF); - MF.verify(); + MF.verify(nullptr, nullptr, &errs()); return false; } diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index be783bc4e29738..a52c82d77ca644 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -119,10 +119,10 @@ static cl::opt LoopToColdBlockRatio( "(frequency of block) is greater than this ratio"), cl::init(5), cl::Hidden); -static cl::opt ForceLoopColdBlock( - "force-loop-cold-block", - cl::desc("Force outlining cold blocks from loops."), - cl::init(false), cl::Hidden); +static cl::opt + ForceLoopColdBlock("force-loop-cold-block", + cl::desc("Force outlining cold blocks from loops."), + cl::init(false), cl::Hidden); static cl::opt PreciseRotationCost("precise-rotation-cost", @@ -147,43 +147,43 @@ static cl::opt JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden); static cl::opt -TailDupPlacement("tail-dup-placement", - cl::desc("Perform tail duplication during placement. " - "Creates more fallthrough opportunites in " - "outline branches."), - cl::init(true), cl::Hidden); + TailDupPlacement("tail-dup-placement", + cl::desc("Perform tail duplication during placement. " + "Creates more fallthrough opportunites in " + "outline branches."), + cl::init(true), cl::Hidden); static cl::opt -BranchFoldPlacement("branch-fold-placement", - cl::desc("Perform branch folding during placement. " - "Reduces code size."), - cl::init(true), cl::Hidden); + BranchFoldPlacement("branch-fold-placement", + cl::desc("Perform branch folding during placement. " + "Reduces code size."), + cl::init(true), cl::Hidden); // Heuristic for tail duplication. static cl::opt TailDupPlacementThreshold( "tail-dup-placement-threshold", cl::desc("Instruction cutoff for tail duplication during layout. " "Tail merging during layout is forced to have a threshold " - "that won't conflict."), cl::init(2), - cl::Hidden); + "that won't conflict."), + cl::init(2), cl::Hidden); // Heuristic for aggressive tail duplication. static cl::opt TailDupPlacementAggressiveThreshold( "tail-dup-placement-aggressive-threshold", cl::desc("Instruction cutoff for aggressive tail duplication during " "layout. Used at -O3. Tail merging during layout is forced to " - "have a threshold that won't conflict."), cl::init(4), - cl::Hidden); + "have a threshold that won't conflict."), + cl::init(4), cl::Hidden); // Heuristic for tail duplication. static cl::opt TailDupPlacementPenalty( "tail-dup-placement-penalty", - cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. " - "Copying can increase fallthrough, but it also increases icache " - "pressure. This parameter controls the penalty to account for that. 
" - "Percent as integer."), - cl::init(2), - cl::Hidden); + cl::desc( + "Cost penalty for blocks that can avoid breaking CFG by copying. " + "Copying can increase fallthrough, but it also increases icache " + "pressure. This parameter controls the penalty to account for that. " + "Percent as integer."), + cl::init(2), cl::Hidden); // Heuristic for tail duplication if profile count is used in cost model. static cl::opt TailDupProfilePercentThreshold( @@ -198,8 +198,7 @@ static cl::opt TriangleChainCount( "triangle-chain-count", cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " "triangle tail duplication heuristic to kick in. 0 to disable."), - cl::init(2), - cl::Hidden); + cl::init(2), cl::Hidden); // Use case: When block layout is visualized after MBP pass, the basic blocks // are labeled in layout order; meanwhile blocks could be numbered in a @@ -292,8 +291,8 @@ class BlockChain { iterator end() { return Blocks.end(); } const_iterator end() const { return Blocks.end(); } - bool remove(MachineBasicBlock* BB) { - for(iterator i = begin(); i != end(); ++i) { + bool remove(MachineBasicBlock *BB) { + for (iterator i = begin(); i != end(); ++i) { if (*i == BB) { Blocks.erase(i); return true; @@ -405,6 +404,8 @@ class MachineBlockPlacement : public MachineFunctionPass { ProfileSummaryInfo *PSI = nullptr; + TargetPassConfig *PassConfig = nullptr; + /// Duplicator used to duplicate tails during placement. /// /// Placement decisions can open up new tail duplication opportunities, but @@ -415,6 +416,8 @@ class MachineBlockPlacement : public MachineFunctionPass { /// Partial tail duplication threshold. BlockFrequency DupThreshold; + unsigned TailDupSize; + /// True: use block profile count to compute tail duplication cost. /// False: use block frequency to compute tail duplication cost. bool UseProfileCount = false; @@ -459,26 +462,24 @@ class MachineBlockPlacement : public MachineFunctionPass { /// Scale the DupThreshold according to basic block size. BlockFrequency scaleThreshold(MachineBasicBlock *BB); - void initDupThreshold(); + void initTailDupThreshold(); /// Decrease the UnscheduledPredecessors count for all blocks in chain, and /// if the count goes to 0, add them to the appropriate work list. - void markChainSuccessors( - const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB, - const BlockFilterSet *BlockFilter = nullptr); + void markChainSuccessors(const BlockChain &Chain, + const MachineBasicBlock *LoopHeaderBB, + const BlockFilterSet *BlockFilter = nullptr); /// Decrease the UnscheduledPredecessors count for a single block, and /// if the count goes to 0, add them to the appropriate work list. 
- void markBlockSuccessors( - const BlockChain &Chain, const MachineBasicBlock *BB, - const MachineBasicBlock *LoopHeaderBB, - const BlockFilterSet *BlockFilter = nullptr); + void markBlockSuccessors(const BlockChain &Chain, const MachineBasicBlock *BB, + const MachineBasicBlock *LoopHeaderBB, + const BlockFilterSet *BlockFilter = nullptr); BranchProbability - collectViableSuccessors( - const MachineBasicBlock *BB, const BlockChain &Chain, - const BlockFilterSet *BlockFilter, - SmallVector &Successors); + collectViableSuccessors(const MachineBasicBlock *BB, const BlockChain &Chain, + const BlockFilterSet *BlockFilter, + SmallVector &Successors); bool isBestSuccessor(MachineBasicBlock *BB, MachineBasicBlock *Pred, BlockFilterSet *BlockFilter); void findDuplicateCandidates(SmallVectorImpl &Candidates, @@ -496,16 +497,19 @@ class MachineBlockPlacement : public MachineFunctionPass { MachineFunction::iterator &PrevUnplacedBlockIt, BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt, bool &DuplicatedToLPred); - bool hasBetterLayoutPredecessor( - const MachineBasicBlock *BB, const MachineBasicBlock *Succ, - const BlockChain &SuccChain, BranchProbability SuccProb, - BranchProbability RealSuccProb, const BlockChain &Chain, - const BlockFilterSet *BlockFilter); - BlockAndTailDupResult selectBestSuccessor( - const MachineBasicBlock *BB, const BlockChain &Chain, - const BlockFilterSet *BlockFilter); - MachineBasicBlock *selectBestCandidateBlock( - const BlockChain &Chain, SmallVectorImpl &WorkList); + bool hasBetterLayoutPredecessor(const MachineBasicBlock *BB, + const MachineBasicBlock *Succ, + const BlockChain &SuccChain, + BranchProbability SuccProb, + BranchProbability RealSuccProb, + const BlockChain &Chain, + const BlockFilterSet *BlockFilter); + BlockAndTailDupResult selectBestSuccessor(const MachineBasicBlock *BB, + const BlockChain &Chain, + const BlockFilterSet *BlockFilter); + MachineBasicBlock * + selectBestCandidateBlock(const BlockChain &Chain, + SmallVectorImpl &WorkList); MachineBasicBlock * getFirstUnplacedBlock(const BlockChain &PlacedChain, MachineFunction::iterator &PrevUnplacedBlockIt); @@ -536,20 +540,19 @@ class MachineBlockPlacement : public MachineFunctionPass { const MachineBasicBlock *ExitBB, const BlockFilterSet &LoopBlockSet); MachineBasicBlock *findBestLoopTopHelper(MachineBasicBlock *OldTop, - const MachineLoop &L, const BlockFilterSet &LoopBlockSet); - MachineBasicBlock *findBestLoopTop( - const MachineLoop &L, const BlockFilterSet &LoopBlockSet); - MachineBasicBlock *findBestLoopExit( - const MachineLoop &L, const BlockFilterSet &LoopBlockSet, - BlockFrequency &ExitFreq); + const MachineLoop &L, + const BlockFilterSet &LoopBlockSet); + MachineBasicBlock *findBestLoopTop(const MachineLoop &L, + const BlockFilterSet &LoopBlockSet); + MachineBasicBlock *findBestLoopExit(const MachineLoop &L, + const BlockFilterSet &LoopBlockSet, + BlockFrequency &ExitFreq); BlockFilterSet collectLoopBlockSet(const MachineLoop &L); void buildLoopChains(const MachineLoop &L); - void rotateLoop( - BlockChain &LoopChain, const MachineBasicBlock *ExitingBB, - BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet); - void rotateLoopWithProfile( - BlockChain &LoopChain, const MachineLoop &L, - const BlockFilterSet &LoopBlockSet); + void rotateLoop(BlockChain &LoopChain, const MachineBasicBlock *ExitingBB, + BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet); + void rotateLoopWithProfile(BlockChain &LoopChain, const MachineLoop &L, + const BlockFilterSet &LoopBlockSet); void 
buildCFGChains(); void optimizeBranches(); void alignBlocks(); @@ -558,10 +561,10 @@ class MachineBlockPlacement : public MachineFunctionPass { bool shouldTailDuplicate(MachineBasicBlock *BB); /// Check the edge frequencies to see if tail duplication will increase /// fallthroughs. - bool isProfitableToTailDup( - const MachineBasicBlock *BB, const MachineBasicBlock *Succ, - BranchProbability QProb, - const BlockChain &Chain, const BlockFilterSet *BlockFilter); + bool isProfitableToTailDup(const MachineBasicBlock *BB, + const MachineBasicBlock *Succ, + BranchProbability QProb, const BlockChain &Chain, + const BlockFilterSet *BlockFilter); /// Check for a trellis layout. bool isTrellis(const MachineBasicBlock *BB, @@ -582,9 +585,10 @@ class MachineBlockPlacement : public MachineFunctionPass { /// Returns true if a block can tail duplicate into all unplaced /// predecessors. Filters based on loop. - bool canTailDuplicateUnplacedPreds( - const MachineBasicBlock *BB, MachineBasicBlock *Succ, - const BlockChain &Chain, const BlockFilterSet *BlockFilter); + bool canTailDuplicateUnplacedPreds(const MachineBasicBlock *BB, + MachineBasicBlock *Succ, + const BlockChain &Chain, + const BlockFilterSet *BlockFilter); /// Find chains of triangles to tail-duplicate where a global analysis works, /// but a local analysis would not find them. @@ -802,8 +806,8 @@ bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) { /// Compare 2 BlockFrequency's with a small penalty for \p A. /// In order to be conservative, we apply a X% penalty to account for /// increased icache pressure and static heuristics. For small frequencies -/// we use only the numerators to improve accuracy. For simplicity, we assume the -/// penalty is less than 100% +/// we use only the numerators to improve accuracy. For simplicity, we assume +/// the penalty is less than 100% /// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere. static bool greaterWithBias(BlockFrequency A, BlockFrequency B, BlockFrequency EntryFreq) { @@ -819,8 +823,8 @@ static bool greaterWithBias(BlockFrequency A, BlockFrequency B, /// considering duplication. bool MachineBlockPlacement::isProfitableToTailDup( const MachineBasicBlock *BB, const MachineBasicBlock *Succ, - BranchProbability QProb, - const BlockChain &Chain, const BlockFilterSet *BlockFilter) { + BranchProbability QProb, const BlockChain &Chain, + const BlockFilterSet *BlockFilter) { // We need to do a probability calculation to make sure this is profitable. // First: does succ have a successor that post-dominates? This affects the // calculation. The 2 relevant cases are: @@ -876,12 +880,12 @@ bool MachineBlockPlacement::isProfitableToTailDup( // from BB. auto SuccBestPred = BlockFrequency(0); for (MachineBasicBlock *SuccPred : Succ->predecessors()) { - if (SuccPred == Succ || SuccPred == BB - || BlockToChain[SuccPred] == &Chain - || (BlockFilter && !BlockFilter->count(SuccPred))) + if (SuccPred == Succ || SuccPred == BB || + BlockToChain[SuccPred] == &Chain || + (BlockFilter && !BlockFilter->count(SuccPred))) continue; - auto Freq = MBFI->getBlockFreq(SuccPred) - * MBPI->getEdgeProbability(SuccPred, Succ); + auto Freq = + MBFI->getBlockFreq(SuccPred) * MBPI->getEdgeProbability(SuccPred, Succ); if (Freq > SuccBestPred) SuccBestPred = Freq; } @@ -1137,7 +1141,7 @@ MachineBlockPlacement::getBestTrellisSuccessor( } // We have already computed the optimal edge for the other side of the // trellis. 
- ComputedEdges[BestB.Src] = { BestB.Dest, false }; + ComputedEdges[BestB.Src] = {BestB.Dest, false}; auto TrellisSucc = BestA.Dest; LLVM_DEBUG(BranchProbability SuccProb = getAdjustedProbability( @@ -1169,8 +1173,8 @@ bool MachineBlockPlacement::canTailDuplicateUnplacedPreds( // Make sure all unplaced and unfiltered predecessors can be // tail-duplicated into. // Skip any blocks that are already placed or not in this loop. - if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) - || (BlockToChain[Pred] == &Chain && !Succ->succ_empty())) + if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) || + (BlockToChain[Pred] == &Chain && !Succ->succ_empty())) continue; if (!TailDup.canTailDuplicate(Succ, Pred)) { if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors)) @@ -1289,9 +1293,7 @@ void MachineBlockPlacement::precomputeTriangleChains() { unsigned count() const { return Edges.size() - 1; } - MachineBasicBlock *getKey() const { - return Edges.back(); - } + MachineBasicBlock *getKey() const { return Edges.back(); } }; if (TriangleChainCount == 0) @@ -1326,7 +1328,7 @@ void MachineBlockPlacement::precomputeTriangleChains() { bool CanTailDuplicate = true; // If PDom can't tail-duplicate into it's non-BB predecessors, then this // isn't the kind of triangle we're looking for. - for (MachineBasicBlock* Pred : PDom->predecessors()) { + for (MachineBasicBlock *Pred : PDom->predecessors()) { if (Pred == &BB) continue; if (!TailDup.canTailDuplicate(PDom, Pred)) { @@ -1386,8 +1388,8 @@ void MachineBlockPlacement::precomputeTriangleChains() { // When profile is not present, return the StaticLikelyProb. // When profile is available, we need to handle the triangle-shape CFG. -static BranchProbability getLayoutSuccessorProbThreshold( - const MachineBasicBlock *BB) { +static BranchProbability +getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB) { if (!BB->getParent()->getFunction().hasProfileData()) return BranchProbability(StaticLikelyProb, 100); if (BB->succ_size() == 2) { @@ -1551,8 +1553,8 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor( for (MachineBasicBlock *Pred : Succ->predecessors()) { BlockChain *PredChain = BlockToChain[Pred]; if (Pred == Succ || PredChain == &SuccChain || - (BlockFilter && !BlockFilter->count(Pred)) || - PredChain == &Chain || Pred != *std::prev(PredChain->end()) || + (BlockFilter && !BlockFilter->count(Pred)) || PredChain == &Chain || + Pred != *std::prev(PredChain->end()) || // This check is redundant except for look ahead. This function is // called for lookahead by isProfitableToTailDup when BB hasn't been // placed yet. @@ -1599,12 +1601,12 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor( /// \returns The best successor block found, or null if none are viable, along /// with a boolean indicating if tail duplication is necessary. 
MachineBlockPlacement::BlockAndTailDupResult -MachineBlockPlacement::selectBestSuccessor( - const MachineBasicBlock *BB, const BlockChain &Chain, - const BlockFilterSet *BlockFilter) { +MachineBlockPlacement::selectBestSuccessor(const MachineBasicBlock *BB, + const BlockChain &Chain, + const BlockFilterSet *BlockFilter) { const BranchProbability HotProb(StaticLikelyProb, 100); - BlockAndTailDupResult BestSucc = { nullptr, false }; + BlockAndTailDupResult BestSucc = {nullptr, false}; auto BestProb = BranchProbability::getZero(); SmallVector Successors; @@ -1684,8 +1686,8 @@ MachineBlockPlacement::selectBestSuccessor( std::tie(DupProb, Succ) = Tup; if (DupProb < BestProb) break; - if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter) - && (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) { + if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter) && + (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) { LLVM_DEBUG(dbgs() << " Candidate: " << getBlockName(Succ) << ", probability: " << DupProb << " (Tail Duplicate)\n"); @@ -1822,8 +1824,7 @@ MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock( } void MachineBlockPlacement::fillWorkLists( - const MachineBasicBlock *MBB, - SmallPtrSetImpl &UpdatedPreds, + const MachineBasicBlock *MBB, SmallPtrSetImpl &UpdatedPreds, const BlockFilterSet *BlockFilter = nullptr) { BlockChain &Chain = *BlockToChain[MBB]; if (!UpdatedPreds.insert(&Chain).second) @@ -1854,9 +1855,9 @@ void MachineBlockPlacement::fillWorkLists( BlockWorkList.push_back(BB); } -void MachineBlockPlacement::buildChain( - const MachineBasicBlock *HeadBB, BlockChain &Chain, - BlockFilterSet *BlockFilter) { +void MachineBlockPlacement::buildChain(const MachineBasicBlock *HeadBB, + BlockChain &Chain, + BlockFilterSet *BlockFilter) { assert(HeadBB && "BB must not be null.\n"); assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n"); MachineFunction::iterator PrevUnplacedBlockIt = F->begin(); @@ -1872,16 +1873,14 @@ void MachineBlockPlacement::buildChain( assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop."); assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain."); - // Look for the best viable successor if there is one to place immediately // after this block. auto Result = selectBestSuccessor(BB, Chain, BlockFilter); - MachineBasicBlock* BestSucc = Result.BB; + MachineBasicBlock *BestSucc = Result.BB; bool ShouldTailDup = Result.ShouldTailDup; if (allowTailDupPlacement()) - ShouldTailDup |= (BestSucc && canTailDuplicateUnplacedPreds(BB, BestSucc, - Chain, - BlockFilter)); + ShouldTailDup |= (BestSucc && canTailDuplicateUnplacedPreds( + BB, BestSucc, Chain, BlockFilter)); // If an immediate successor isn't available, look for the best viable // block among those we've identified as not violating the loop's CFG at @@ -1918,8 +1917,8 @@ void MachineBlockPlacement::buildChain( // Place this block, updating the datastructures to reflect its placement. BlockChain &SuccChain = *BlockToChain[BestSucc]; - // Zero out UnscheduledPredecessors for the successor we're about to merge in case - // we selected a successor that didn't fit naturally into the CFG. + // Zero out UnscheduledPredecessors for the successor we're about to merge + // in case we selected a successor that didn't fit naturally into the CFG. 
SuccChain.UnscheduledPredecessors = 0; LLVM_DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to " << getBlockName(BestSucc) << "\n"); @@ -1946,10 +1945,8 @@ void MachineBlockPlacement::buildChain( // If BB is moved before OldTop, Pred needs a taken branch to BB, and it can't // layout the other successor below it, so it can't reduce taken branch. // In this case we keep its original layout. -bool -MachineBlockPlacement::canMoveBottomBlockToTop( - const MachineBasicBlock *BottomBlock, - const MachineBasicBlock *OldTop) { +bool MachineBlockPlacement::canMoveBottomBlockToTop( + const MachineBasicBlock *BottomBlock, const MachineBasicBlock *OldTop) { if (BottomBlock->pred_size() != 1) return true; MachineBasicBlock *Pred = *BottomBlock->pred_begin(); @@ -1967,9 +1964,8 @@ MachineBlockPlacement::canMoveBottomBlockToTop( // Find out the possible fall through frequence to the top of a loop. BlockFrequency -MachineBlockPlacement::TopFallThroughFreq( - const MachineBasicBlock *Top, - const BlockFilterSet &LoopBlockSet) { +MachineBlockPlacement::TopFallThroughFreq(const MachineBasicBlock *Top, + const BlockFilterSet &LoopBlockSet) { BlockFrequency MaxFreq = BlockFrequency(0); for (MachineBasicBlock *Pred : Top->predecessors()) { BlockChain *PredChain = BlockToChain[Pred]; @@ -1991,8 +1987,8 @@ MachineBlockPlacement::TopFallThroughFreq( } } if (TopOK) { - BlockFrequency EdgeFreq = MBFI->getBlockFreq(Pred) * - MBPI->getEdgeProbability(Pred, Top); + BlockFrequency EdgeFreq = + MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Top); if (EdgeFreq > MaxFreq) MaxFreq = EdgeFreq; } @@ -2022,19 +2018,16 @@ MachineBlockPlacement::TopFallThroughFreq( // |- // V // -BlockFrequency -MachineBlockPlacement::FallThroughGains( - const MachineBasicBlock *NewTop, - const MachineBasicBlock *OldTop, - const MachineBasicBlock *ExitBB, - const BlockFilterSet &LoopBlockSet) { +BlockFrequency MachineBlockPlacement::FallThroughGains( + const MachineBasicBlock *NewTop, const MachineBasicBlock *OldTop, + const MachineBasicBlock *ExitBB, const BlockFilterSet &LoopBlockSet) { BlockFrequency FallThrough2Top = TopFallThroughFreq(OldTop, LoopBlockSet); BlockFrequency FallThrough2Exit = BlockFrequency(0); if (ExitBB) - FallThrough2Exit = MBFI->getBlockFreq(NewTop) * - MBPI->getEdgeProbability(NewTop, ExitBB); - BlockFrequency BackEdgeFreq = MBFI->getBlockFreq(NewTop) * - MBPI->getEdgeProbability(NewTop, OldTop); + FallThrough2Exit = + MBFI->getBlockFreq(NewTop) * MBPI->getEdgeProbability(NewTop, ExitBB); + BlockFrequency BackEdgeFreq = + MBFI->getBlockFreq(NewTop) * MBPI->getEdgeProbability(NewTop, OldTop); // Find the best Pred of NewTop. MachineBasicBlock *BestPred = nullptr; @@ -2113,10 +2106,8 @@ MachineBlockPlacement::FallThroughGains( /// At the same time, move it before old top increases the taken branch /// to loop exit block, so the reduced taken branch will be compared with /// the increased taken branch to the loop exit block. -MachineBasicBlock * -MachineBlockPlacement::findBestLoopTopHelper( - MachineBasicBlock *OldTop, - const MachineLoop &L, +MachineBasicBlock *MachineBlockPlacement::findBestLoopTopHelper( + MachineBasicBlock *OldTop, const MachineLoop &L, const BlockFilterSet &LoopBlockSet) { // Check that the header hasn't been fused with a preheader block due to // crazy branches. 
If it has, we need to start with the header at the top to @@ -2153,8 +2144,8 @@ MachineBlockPlacement::findBestLoopTopHelper( if (!canMoveBottomBlockToTop(Pred, OldTop)) continue; - BlockFrequency Gains = FallThroughGains(Pred, OldTop, OtherBB, - LoopBlockSet); + BlockFrequency Gains = + FallThroughGains(Pred, OldTop, OtherBB, LoopBlockSet); if ((Gains > BlockFrequency(0)) && (Gains > BestGains || ((Gains == BestGains) && Pred->isLayoutSuccessor(OldTop)))) { @@ -2204,7 +2195,7 @@ MachineBlockPlacement::findBestLoopTop(const MachineLoop &L, OldTop = NewTop; NewTop = findBestLoopTopHelper(OldTop, L, LoopBlockSet); if (NewTop != OldTop) - ComputedEdges[NewTop] = { OldTop, false }; + ComputedEdges[NewTop] = {OldTop, false}; } return NewTop; } @@ -2336,10 +2327,8 @@ MachineBlockPlacement::findBestLoopExit(const MachineLoop &L, /// /// 1. Look for a Pred that can be layout before Top. /// 2. Check if Top is the most possible successor of Pred. -bool -MachineBlockPlacement::hasViableTopFallthrough( - const MachineBasicBlock *Top, - const BlockFilterSet &LoopBlockSet) { +bool MachineBlockPlacement::hasViableTopFallthrough( + const MachineBasicBlock *Top, const BlockFilterSet &LoopBlockSet) { for (MachineBasicBlock *Pred : Top->predecessors()) { BlockChain *PredChain = BlockToChain[Pred]; if (!LoopBlockSet.count(Pred) && @@ -2491,7 +2480,7 @@ void MachineBlockPlacement::rotateLoopWithProfile( if (!LoopBlockSet.count(Pred) && (!PredChain || Pred == *std::prev(PredChain->end()))) { auto EdgeFreq = MBFI->getBlockFreq(Pred) * - MBPI->getEdgeProbability(Pred, ChainHeaderBB); + MBPI->getEdgeProbability(Pred, ChainHeaderBB); auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost); // If the predecessor has only an unconditional jump to the header, we // need to consider the cost of this jump. @@ -2951,12 +2940,16 @@ void MachineBlockPlacement::alignBlocks() { // exclusively on the loop info here so that we can align backedges in // unnatural CFGs and backedges that were introduced purely because of the // loop rotations done during this layout pass. - if (F->getFunction().hasMinSize() || - (F->getFunction().hasOptSize() && !TLI->alignLoopsWithOptSize())) - return; + if (!AlignAllBlock && !AlignAllNonFallThruBlocks) { + if (F->getFunction().hasMinSize() || + (F->getFunction().hasOptSize() && !TLI->alignLoopsWithOptSize())) + return; + } + BlockChain &FunctionChain = *BlockToChain[&F->front()]; + // Empty chain. if (FunctionChain.begin() == FunctionChain.end()) - return; // Empty chain. + return; const BranchProbability ColdProb(1, 5); // 20% BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front()); @@ -3052,6 +3045,33 @@ void MachineBlockPlacement::alignBlocks() { DetermineMaxAlignmentPadding(); } } + + const bool HasMaxBytesOverride = + MaxBytesForAlignmentOverride.getNumOccurrences() > 0; + + if (AlignAllBlock) + // Align all of the blocks in the function to a specific alignment. + for (MachineBasicBlock &MBB : *F) { + if (HasMaxBytesOverride) + MBB.setAlignment(Align(1ULL << AlignAllBlock), + MaxBytesForAlignmentOverride); + else + MBB.setAlignment(Align(1ULL << AlignAllBlock)); + } + else if (AlignAllNonFallThruBlocks) { + // Align all of the blocks that have no fall-through predecessors to a + // specific alignment. 
+ for (auto MBI = std::next(F->begin()), MBE = F->end(); MBI != MBE; ++MBI) { + auto LayoutPred = std::prev(MBI); + if (!LayoutPred->isSuccessor(&*MBI)) { + if (HasMaxBytesOverride) + MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks), + MaxBytesForAlignmentOverride); + else + MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks)); + } + } + } } /// Tail duplicate \p BB into (some) predecessors if profitable, repeating if @@ -3142,67 +3162,66 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock( // This has to be a callback because none of it can be done after // BB is deleted. bool Removed = false; - auto RemovalCallback = - [&](MachineBasicBlock *RemBB) { - // Signal to outer function - Removed = true; - - // Conservative default. - bool InWorkList = true; - // Remove from the Chain and Chain Map - if (BlockToChain.count(RemBB)) { - BlockChain *Chain = BlockToChain[RemBB]; - InWorkList = Chain->UnscheduledPredecessors == 0; - Chain->remove(RemBB); - BlockToChain.erase(RemBB); - } - - // Handle the unplaced block iterator - if (&(*PrevUnplacedBlockIt) == RemBB) { - PrevUnplacedBlockIt++; - } - - // Handle the Work Lists - if (InWorkList) { - SmallVectorImpl &RemoveList = BlockWorkList; - if (RemBB->isEHPad()) - RemoveList = EHPadWorkList; - llvm::erase(RemoveList, RemBB); - } - - // Handle the filter set - if (BlockFilter) { - auto It = llvm::find(*BlockFilter, RemBB); - // Erase RemBB from BlockFilter, and keep PrevUnplacedBlockInFilterIt - // pointing to the same element as before. - if (It != BlockFilter->end()) { - if (It < PrevUnplacedBlockInFilterIt) { - const MachineBasicBlock *PrevBB = *PrevUnplacedBlockInFilterIt; - // BlockFilter is a SmallVector so all elements after RemBB are - // shifted to the front by 1 after its deletion. - auto Distance = PrevUnplacedBlockInFilterIt - It - 1; - PrevUnplacedBlockInFilterIt = BlockFilter->erase(It) + Distance; - assert(*PrevUnplacedBlockInFilterIt == PrevBB); - (void)PrevBB; - } else if (It == PrevUnplacedBlockInFilterIt) - // The block pointed by PrevUnplacedBlockInFilterIt is erased, we - // have to set it to the next element. - PrevUnplacedBlockInFilterIt = BlockFilter->erase(It); - else - BlockFilter->erase(It); - } - } + auto RemovalCallback = [&](MachineBasicBlock *RemBB) { + // Signal to outer function + Removed = true; + + // Conservative default. + bool InWorkList = true; + // Remove from the Chain and Chain Map + if (BlockToChain.count(RemBB)) { + BlockChain *Chain = BlockToChain[RemBB]; + InWorkList = Chain->UnscheduledPredecessors == 0; + Chain->remove(RemBB); + BlockToChain.erase(RemBB); + } + + // Handle the unplaced block iterator + if (&(*PrevUnplacedBlockIt) == RemBB) { + PrevUnplacedBlockIt++; + } + + // Handle the Work Lists + if (InWorkList) { + SmallVectorImpl &RemoveList = BlockWorkList; + if (RemBB->isEHPad()) + RemoveList = EHPadWorkList; + llvm::erase(RemoveList, RemBB); + } + + // Handle the filter set + if (BlockFilter) { + auto It = llvm::find(*BlockFilter, RemBB); + // Erase RemBB from BlockFilter, and keep PrevUnplacedBlockInFilterIt + // pointing to the same element as before. + if (It != BlockFilter->end()) { + if (It < PrevUnplacedBlockInFilterIt) { + const MachineBasicBlock *PrevBB = *PrevUnplacedBlockInFilterIt; + // BlockFilter is a SmallVector so all elements after RemBB are + // shifted to the front by 1 after its deletion. 
+          auto Distance = PrevUnplacedBlockInFilterIt - It - 1;
+          PrevUnplacedBlockInFilterIt = BlockFilter->erase(It) + Distance;
+          assert(*PrevUnplacedBlockInFilterIt == PrevBB);
+          (void)PrevBB;
+        } else if (It == PrevUnplacedBlockInFilterIt)
+          // The block pointed to by PrevUnplacedBlockInFilterIt is erased, so
+          // we have to set it to the next element.
+          PrevUnplacedBlockInFilterIt = BlockFilter->erase(It);
+        else
+          BlockFilter->erase(It);
+      }
+    }
-        // Remove the block from loop info.
-        MLI->removeBlock(RemBB);
-        if (RemBB == PreferredLoopExit)
-          PreferredLoopExit = nullptr;
+    // Remove the block from loop info.
+    MLI->removeBlock(RemBB);
+    if (RemBB == PreferredLoopExit)
+      PreferredLoopExit = nullptr;
-        LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: "
-                          << getBlockName(RemBB) << "\n");
-      };
+    LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: " << getBlockName(RemBB)
+                      << "\n");
+  };
   auto RemovalCallbackRef =
-      function_ref(RemovalCallback);
+      function_ref(RemovalCallback);
   SmallVector DuplicatedPreds;
   bool IsSimple = TailDup.isSimpleBB(BB);
@@ -3223,11 +3242,11 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock(
   DuplicatedToLPred = false;
   for (MachineBasicBlock *Pred : DuplicatedPreds) {
     // We're only looking for unscheduled predecessors that match the filter.
-    BlockChain* PredChain = BlockToChain[Pred];
+    BlockChain *PredChain = BlockToChain[Pred];
     if (Pred == LPred)
       DuplicatedToLPred = true;
-    if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred))
-        || PredChain == &Chain)
+    if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred)) ||
+        PredChain == &Chain)
       continue;
     for (MachineBasicBlock *NewSucc : Pred->successors()) {
       if (BlockFilter && !BlockFilter->count(NewSucc))
@@ -3297,8 +3316,7 @@ bool MachineBlockPlacement::isBestSuccessor(MachineBasicBlock *BB,
 // Find out the predecessors of BB and BB can be beneficially duplicated into
 // them.
 void MachineBlockPlacement::findDuplicateCandidates(
-    SmallVectorImpl<MachineBasicBlock *> &Candidates,
-    MachineBasicBlock *BB,
+    SmallVectorImpl<MachineBasicBlock *> &Candidates, MachineBasicBlock *BB,
     BlockFilterSet *BlockFilter) {
   MachineBasicBlock *Fallthrough = nullptr;
   BranchProbability DefaultBranchProb = BranchProbability::getZero();
@@ -3407,31 +3425,53 @@ void MachineBlockPlacement::findDuplicateCandidates(
   }
 }

-void MachineBlockPlacement::initDupThreshold() {
+void MachineBlockPlacement::initTailDupThreshold() {
   DupThreshold = BlockFrequency(0);
-  if (!F->getFunction().hasProfileData())
-    return;
+  if (F->getFunction().hasProfileData()) {
+    // We prefer to use profile count.
+    uint64_t HotThreshold = PSI->getOrCompHotCountThreshold();
+    if (HotThreshold != UINT64_MAX) {
+      UseProfileCount = true;
+      DupThreshold =
+          BlockFrequency(HotThreshold * TailDupProfilePercentThreshold / 100);
+    } else {
+      // Profile count is not available, we can use block frequency instead.
+      BlockFrequency MaxFreq = BlockFrequency(0);
+      for (MachineBasicBlock &MBB : *F) {
+        BlockFrequency Freq = MBFI->getBlockFreq(&MBB);
+        if (Freq > MaxFreq)
+          MaxFreq = Freq;
+      }
-  // We prefer to use prifile count.
-  uint64_t HotThreshold = PSI->getOrCompHotCountThreshold();
-  if (HotThreshold != UINT64_MAX) {
-    UseProfileCount = true;
-    DupThreshold =
-        BlockFrequency(HotThreshold * TailDupProfilePercentThreshold / 100);
-    return;
+      BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
+      DupThreshold = BlockFrequency(MaxFreq * ThresholdProb);
+      UseProfileCount = false;
+    }
   }
-  // Profile count is not available, we can use block frequency instead.
- BlockFrequency MaxFreq = BlockFrequency(0); - for (MachineBasicBlock &MBB : *F) { - BlockFrequency Freq = MBFI->getBlockFreq(&MBB); - if (Freq > MaxFreq) - MaxFreq = Freq; + TailDupSize = TailDupPlacementThreshold; + // If only the aggressive threshold is explicitly set, use it. + if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 && + TailDupPlacementThreshold.getNumOccurrences() == 0) + TailDupSize = TailDupPlacementAggressiveThreshold; + + // For aggressive optimization, we can adjust some thresholds to be less + // conservative. + if (PassConfig->getOptLevel() >= CodeGenOptLevel::Aggressive) { + // At O3 we should be more willing to copy blocks for tail duplication. This + // increases size pressure, so we only do it at O3 + // Do this unless only the regular threshold is explicitly set. + if (TailDupPlacementThreshold.getNumOccurrences() == 0 || + TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0) + TailDupSize = TailDupPlacementAggressiveThreshold; } - BranchProbability ThresholdProb(TailDupPlacementPenalty, 100); - DupThreshold = BlockFrequency(MaxFreq * ThresholdProb); - UseProfileCount = false; + // If there's no threshold provided through options, query the target + // information for a threshold instead. + if (TailDupPlacementThreshold.getNumOccurrences() == 0 && + (PassConfig->getOptLevel() < CodeGenOptLevel::Aggressive || + TailDupPlacementAggressiveThreshold.getNumOccurrences() == 0)) + TailDupSize = TII->getTailDuplicateSize(PassConfig->getOptLevel()); } bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { @@ -3451,8 +3491,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { TLI = MF.getSubtarget().getTargetLowering(); MPDT = nullptr; PSI = &getAnalysis().getPSI(); - - initDupThreshold(); + PassConfig = &getAnalysis(); // Initialize PreferredLoopExit to nullptr here since it may never be set if // there are no MachineLoops. @@ -3463,38 +3502,17 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { assert(ComputedEdges.empty() && "Computed Edge map should be empty before starting placement."); - unsigned TailDupSize = TailDupPlacementThreshold; - // If only the aggressive threshold is explicitly set, use it. - if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 && - TailDupPlacementThreshold.getNumOccurrences() == 0) - TailDupSize = TailDupPlacementAggressiveThreshold; - - TargetPassConfig *PassConfig = &getAnalysis(); - // For aggressive optimization, we can adjust some thresholds to be less - // conservative. - if (PassConfig->getOptLevel() >= CodeGenOptLevel::Aggressive) { - // At O3 we should be more willing to copy blocks for tail duplication. This - // increases size pressure, so we only do it at O3 - // Do this unless only the regular threshold is explicitly set. - if (TailDupPlacementThreshold.getNumOccurrences() == 0 || - TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0) - TailDupSize = TailDupPlacementAggressiveThreshold; - } - - // If there's no threshold provided through options, query the target - // information for a threshold instead. - if (TailDupPlacementThreshold.getNumOccurrences() == 0 && - (PassConfig->getOptLevel() < CodeGenOptLevel::Aggressive || - TailDupPlacementAggressiveThreshold.getNumOccurrences() == 0)) - TailDupSize = TII->getTailDuplicateSize(PassConfig->getOptLevel()); + // Initialize tail duplication thresholds. + initTailDupThreshold(); + // Apply tail duplication. 
if (allowTailDupPlacement()) { MPDT = &getAnalysis().getPostDomTree(); bool OptForSize = MF.getFunction().hasOptSize() || llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI()); if (OptForSize) TailDupSize = 1; - bool PreRegAlloc = false; + const bool PreRegAlloc = false; TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI.get(), PSI, /* LayoutMode */ true, TailDupSize); precomputeTriangleChains(); @@ -3505,12 +3523,12 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { // Changing the layout can create new tail merging opportunities. // TailMerge can create jump into if branches that make CFG irreducible for // HW that requires structured CFG. - bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() && - PassConfig->getEnableTailMerge() && - BranchFoldPlacement; + const bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() && + PassConfig->getEnableTailMerge() && + BranchFoldPlacement && MF.size() > 3; // No tail merging opportunities if the block number is less than four. - if (MF.size() > 3 && EnableTailMerge) { - unsigned TailMergeSize = TailDupSize + 1; + if (EnableTailMerge) { + const unsigned TailMergeSize = TailDupSize + 1; BranchFolder BF(/*DefaultEnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI, *MBPI, PSI, TailMergeSize); @@ -3545,32 +3563,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { ComputedEdges.clear(); ChainAllocator.DestroyAll(); - bool HasMaxBytesOverride = - MaxBytesForAlignmentOverride.getNumOccurrences() > 0; - - if (AlignAllBlock) - // Align all of the blocks in the function to a specific alignment. - for (MachineBasicBlock &MBB : MF) { - if (HasMaxBytesOverride) - MBB.setAlignment(Align(1ULL << AlignAllBlock), - MaxBytesForAlignmentOverride); - else - MBB.setAlignment(Align(1ULL << AlignAllBlock)); - } - else if (AlignAllNonFallThruBlocks) { - // Align all of the blocks that have no fall-through predecessors to a - // specific alignment. - for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) { - auto LayoutPred = std::prev(MBI); - if (!LayoutPred->isSuccessor(&*MBI)) { - if (HasMaxBytesOverride) - MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks), - MaxBytesForAlignmentOverride); - else - MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks)); - } - } - } + // View the function. if (ViewBlockLayoutWithBFI != GVDT_None && (ViewBlockFreqFuncName.empty() || F->getFunction().getName() == ViewBlockFreqFuncName)) { @@ -3705,7 +3698,7 @@ void MachineBlockPlacement::assignBlockOrder( #ifndef NDEBUG // Make sure we correctly constructed all branches. 
- F->verify(this, "After optimized block reordering"); + F->verify(this, "After optimized block reordering", &errs()); #endif } diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index 4e6d34346b1d80..9b2862de22b690 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -453,7 +453,7 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) { if (VerifyScheduling) { LLVM_DEBUG(LIS->dump()); - MF->verify(this, "Before machine scheduling."); + MF->verify(this, "Before machine scheduling.", &errs()); } RegClassInfo->runOnMachineFunction(*MF); @@ -472,7 +472,7 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) { LLVM_DEBUG(LIS->dump()); if (VerifyScheduling) - MF->verify(this, "After machine scheduling."); + MF->verify(this, "After machine scheduling.", &errs()); return true; } @@ -496,7 +496,7 @@ bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) { AA = &getAnalysis().getAAResults(); if (VerifyScheduling) - MF->verify(this, "Before post machine scheduling."); + MF->verify(this, "Before post machine scheduling.", &errs()); // Instantiate the selected scheduler for this target, function, and // optimization level. @@ -512,7 +512,7 @@ bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) { scheduleRegions(*Scheduler, true); if (VerifyScheduling) - MF->verify(this, "After post machine scheduling."); + MF->verify(this, "After post machine scheduling.", &errs()); return true; } diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index e1295ec8ea6e9a..24a0f41775cc1d 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -94,21 +94,24 @@ using namespace llvm; namespace { struct MachineVerifier { - MachineVerifier(MachineFunctionAnalysisManager &MFAM, const char *b) - : MFAM(&MFAM), Banner(b) {} + MachineVerifier(MachineFunctionAnalysisManager &MFAM, const char *b, + raw_ostream *OS) + : MFAM(&MFAM), OS(OS ? *OS : nulls()), Banner(b) {} - MachineVerifier(Pass *pass, const char *b) : PASS(pass), Banner(b) {} + MachineVerifier(Pass *pass, const char *b, raw_ostream *OS) + : PASS(pass), OS(OS ? *OS : nulls()), Banner(b) {} MachineVerifier(const char *b, LiveVariables *LiveVars, LiveIntervals *LiveInts, LiveStacks *LiveStks, - SlotIndexes *Indexes) - : Banner(b), LiveVars(LiveVars), LiveInts(LiveInts), LiveStks(LiveStks), - Indexes(Indexes) {} + SlotIndexes *Indexes, raw_ostream *OS) + : OS(OS ? 
*OS : nulls()), Banner(b), LiveVars(LiveVars), + LiveInts(LiveInts), LiveStks(LiveStks), Indexes(Indexes) {} unsigned verify(const MachineFunction &MF); MachineFunctionAnalysisManager *MFAM = nullptr; Pass *const PASS = nullptr; + raw_ostream &OS; const char *Banner; const MachineFunction *MF = nullptr; const TargetMachine *TM = nullptr; @@ -334,7 +337,8 @@ namespace { MachineFunctionProperties::Property::FailsVerification)) return false; - unsigned FoundErrors = MachineVerifier(this, Banner.c_str()).verify(MF); + unsigned FoundErrors = + MachineVerifier(this, Banner.c_str(), &errs()).verify(MF); if (FoundErrors) report_fatal_error("Found "+Twine(FoundErrors)+" machine code errors."); return false; @@ -352,7 +356,8 @@ MachineVerifierPass::run(MachineFunction &MF, if (MF.getProperties().hasProperty( MachineFunctionProperties::Property::FailsVerification)) return PreservedAnalyses::all(); - unsigned FoundErrors = MachineVerifier(MFAM, Banner.c_str()).verify(MF); + unsigned FoundErrors = + MachineVerifier(MFAM, Banner.c_str(), &errs()).verify(MF); if (FoundErrors) report_fatal_error("Found " + Twine(FoundErrors) + " machine code errors."); return PreservedAnalyses::all(); @@ -374,25 +379,28 @@ void llvm::verifyMachineFunction(const std::string &Banner, // LiveIntervals *LiveInts; // LiveStacks *LiveStks; // SlotIndexes *Indexes; - unsigned FoundErrors = MachineVerifier(nullptr, Banner.c_str()).verify(MF); + unsigned FoundErrors = + MachineVerifier(nullptr, Banner.c_str(), &errs()).verify(MF); if (FoundErrors) report_fatal_error("Found " + Twine(FoundErrors) + " machine code errors."); } -bool MachineFunction::verify(Pass *p, const char *Banner, bool AbortOnErrors) - const { +bool MachineFunction::verify(Pass *p, const char *Banner, raw_ostream *OS, + bool AbortOnErrors) const { MachineFunction &MF = const_cast(*this); - unsigned FoundErrors = MachineVerifier(p, Banner).verify(MF); + unsigned FoundErrors = MachineVerifier(p, Banner, OS).verify(MF); if (AbortOnErrors && FoundErrors) report_fatal_error("Found "+Twine(FoundErrors)+" machine code errors."); return FoundErrors == 0; } bool MachineFunction::verify(LiveIntervals *LiveInts, SlotIndexes *Indexes, - const char *Banner, bool AbortOnErrors) const { + const char *Banner, raw_ostream *OS, + bool AbortOnErrors) const { MachineFunction &MF = const_cast(*this); unsigned FoundErrors = - MachineVerifier(Banner, nullptr, LiveInts, nullptr, Indexes).verify(MF); + MachineVerifier(Banner, nullptr, LiveInts, nullptr, Indexes, OS) + .verify(MF); if (AbortOnErrors && FoundErrors) report_fatal_error("Found " + Twine(FoundErrors) + " machine code errors."); return FoundErrors == 0; @@ -482,7 +490,7 @@ unsigned MachineVerifier::verify(const MachineFunction &MF) { for (const MachineInstr &MI : MBB.instrs()) { if (MI.getParent() != &MBB) { report("Bad instruction parent pointer", &MBB); - errs() << "Instruction: " << MI; + OS << "Instruction: " << MI; continue; } @@ -540,46 +548,48 @@ unsigned MachineVerifier::verify(const MachineFunction &MF) { void MachineVerifier::report(const char *msg, const MachineFunction *MF) { assert(MF); - errs() << '\n'; + OS << '\n'; if (!foundErrors++) { if (Banner) - errs() << "# " << Banner << '\n'; + OS << "# " << Banner << '\n'; + if (LiveInts != nullptr) - LiveInts->print(errs()); + LiveInts->print(OS); else - MF->print(errs(), Indexes); + MF->print(OS, Indexes); } - errs() << "*** Bad machine code: " << msg << " ***\n" - << "- function: " << MF->getName() << "\n"; + + OS << "*** Bad machine code: " << msg << " ***\n" + << 
"- function: " << MF->getName() << '\n'; } void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB) { assert(MBB); report(msg, MBB->getParent()); - errs() << "- basic block: " << printMBBReference(*MBB) << ' ' - << MBB->getName() << " (" << (const void *)MBB << ')'; + OS << "- basic block: " << printMBBReference(*MBB) << ' ' << MBB->getName() + << " (" << (const void *)MBB << ')'; if (Indexes) - errs() << " [" << Indexes->getMBBStartIdx(MBB) - << ';' << Indexes->getMBBEndIdx(MBB) << ')'; - errs() << '\n'; + OS << " [" << Indexes->getMBBStartIdx(MBB) << ';' + << Indexes->getMBBEndIdx(MBB) << ')'; + OS << '\n'; } void MachineVerifier::report(const char *msg, const MachineInstr *MI) { assert(MI); report(msg, MI->getParent()); - errs() << "- instruction: "; + OS << "- instruction: "; if (Indexes && Indexes->hasIndex(*MI)) - errs() << Indexes->getInstructionIndex(*MI) << '\t'; - MI->print(errs(), /*IsStandalone=*/true); + OS << Indexes->getInstructionIndex(*MI) << '\t'; + MI->print(OS, /*IsStandalone=*/true); } void MachineVerifier::report(const char *msg, const MachineOperand *MO, unsigned MONum, LLT MOVRegType) { assert(MO); report(msg, MO->getParent()); - errs() << "- operand " << MONum << ": "; - MO->print(errs(), MOVRegType, TRI); - errs() << "\n"; + OS << "- operand " << MONum << ": "; + MO->print(OS, MOVRegType, TRI); + OS << '\n'; } void MachineVerifier::report(const Twine &Msg, const MachineInstr *MI) { @@ -587,11 +597,11 @@ void MachineVerifier::report(const Twine &Msg, const MachineInstr *MI) { } void MachineVerifier::report_context(SlotIndex Pos) const { - errs() << "- at: " << Pos << '\n'; + OS << "- at: " << Pos << '\n'; } void MachineVerifier::report_context(const LiveInterval &LI) const { - errs() << "- interval: " << LI << '\n'; + OS << "- interval: " << LI << '\n'; } void MachineVerifier::report_context(const LiveRange &LR, Register VRegUnit, @@ -603,35 +613,35 @@ void MachineVerifier::report_context(const LiveRange &LR, Register VRegUnit, } void MachineVerifier::report_context(const LiveRange::Segment &S) const { - errs() << "- segment: " << S << '\n'; + OS << "- segment: " << S << '\n'; } void MachineVerifier::report_context(const VNInfo &VNI) const { - errs() << "- ValNo: " << VNI.id << " (def " << VNI.def << ")\n"; + OS << "- ValNo: " << VNI.id << " (def " << VNI.def << ")\n"; } void MachineVerifier::report_context_liverange(const LiveRange &LR) const { - errs() << "- liverange: " << LR << '\n'; + OS << "- liverange: " << LR << '\n'; } void MachineVerifier::report_context(MCPhysReg PReg) const { - errs() << "- p. register: " << printReg(PReg, TRI) << '\n'; + OS << "- p. register: " << printReg(PReg, TRI) << '\n'; } void MachineVerifier::report_context_vreg(Register VReg) const { - errs() << "- v. register: " << printReg(VReg, TRI) << '\n'; + OS << "- v. 
register: " << printReg(VReg, TRI) << '\n'; } void MachineVerifier::report_context_vreg_regunit(Register VRegOrUnit) const { if (VRegOrUnit.isVirtual()) { report_context_vreg(VRegOrUnit); } else { - errs() << "- regunit: " << printRegUnit(VRegOrUnit, TRI) << '\n'; + OS << "- regunit: " << printRegUnit(VRegOrUnit, TRI) << '\n'; } } void MachineVerifier::report_context_lanemask(LaneBitmask LaneMask) const { - errs() << "- lanemask: " << PrintLaneMask(LaneMask) << '\n'; + OS << "- lanemask: " << PrintLaneMask(LaneMask) << '\n'; } void MachineVerifier::markReachable(const MachineBasicBlock *MBB) { @@ -710,8 +720,8 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { report("MBB has successor that isn't part of the function.", MBB); if (!MBBInfoMap[succ].Preds.count(MBB)) { report("Inconsistent CFG", MBB); - errs() << "MBB is not in the predecessor list of the successor " - << printMBBReference(*succ) << ".\n"; + OS << "MBB is not in the predecessor list of the successor " + << printMBBReference(*succ) << ".\n"; } } @@ -721,8 +731,8 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { report("MBB has predecessor that isn't part of the function.", MBB); if (!MBBInfoMap[Pred].Succs.count(MBB)) { report("Inconsistent CFG", MBB); - errs() << "MBB is not in the successor list of the predecessor " - << printMBBReference(*Pred) << ".\n"; + OS << "MBB is not in the successor list of the predecessor " + << printMBBReference(*Pred) << ".\n"; } } @@ -880,7 +890,7 @@ void MachineVerifier::visitMachineBundleBefore(const MachineInstr *MI) { SlotIndex idx = Indexes->getInstructionIndex(*MI); if (!(idx > lastIndex)) { report("Instruction index out of order", MI); - errs() << "Last instruction was at " << lastIndex << '\n'; + OS << "Last instruction was at " << lastIndex << '\n'; } lastIndex = idx; } @@ -894,7 +904,7 @@ void MachineVerifier::visitMachineBundleBefore(const MachineInstr *MI) { // precede non-terminators. if (FirstTerminator->getOpcode() != TargetOpcode::G_INVOKE_REGION_START) { report("Non-terminator instruction after the first terminator", MI); - errs() << "First terminator was:\t" << *FirstTerminator; + OS << "First terminator was:\t" << *FirstTerminator; } } } @@ -2185,8 +2195,8 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) { const MCInstrDesc &MCID = MI->getDesc(); if (MI->getNumOperands() < MCID.getNumOperands()) { report("Too few operands", MI); - errs() << MCID.getNumOperands() << " operands expected, but " - << MI->getNumOperands() << " given.\n"; + OS << MCID.getNumOperands() << " operands expected, but " + << MI->getNumOperands() << " given.\n"; } if (MI->getFlag(MachineInstr::NoConvergent) && !MCID.isConvergent()) @@ -2278,7 +2288,7 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) { // If both types are valid, check that the types are the same. 
if (SrcTy != DstTy) { report("Copy Instruction is illegal with mismatching types", MI); - errs() << "Def = " << DstTy << ", Src = " << SrcTy << "\n"; + OS << "Def = " << DstTy << ", Src = " << SrcTy << '\n'; } break; @@ -2322,8 +2332,7 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) { if (SrcSize.isNonZero() && DstSize.isNonZero() && SrcSize != DstSize) { if (!DstOp.getSubReg() && !SrcOp.getSubReg()) { report("Copy Instruction is illegal with mismatching sizes", MI); - errs() << "Def Size = " << DstSize << ", Src Size = " << SrcSize - << "\n"; + OS << "Def Size = " << DstSize << ", Src Size = " << SrcSize << '\n'; } } break; @@ -2554,8 +2563,8 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { TII->getRegClass(MCID, MONum, TRI, *MF)) { if (!DRC->contains(Reg)) { report("Illegal physical register for instruction", MO, MONum); - errs() << printReg(Reg, TRI) << " is not a " - << TRI->getRegClassName(DRC) << " register.\n"; + OS << printReg(Reg, TRI) << " is not a " + << TRI->getRegClassName(DRC) << " register.\n"; } } } @@ -2618,9 +2627,9 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { RBI->getMaximumSize(RegBank->getID()) < Ty.getSizeInBits()) { report("Register bank is too small for virtual register", MO, MONum); - errs() << "Register bank " << RegBank->getName() << " too small(" - << RBI->getMaximumSize(RegBank->getID()) << ") to fit " - << Ty.getSizeInBits() << "-bits\n"; + OS << "Register bank " << RegBank->getName() << " too small(" + << RBI->getMaximumSize(RegBank->getID()) << ") to fit " + << Ty.getSizeInBits() << "-bits\n"; return; } } @@ -2639,10 +2648,9 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { TII->getRegClass(MCID, MONum, TRI, *MF)) { report("Virtual register does not match instruction constraint", MO, MONum); - errs() << "Expect register class " - << TRI->getRegClassName( - TII->getRegClass(MCID, MONum, TRI, *MF)) - << " but got nothing\n"; + OS << "Expect register class " + << TRI->getRegClassName(TII->getRegClass(MCID, MONum, TRI, *MF)) + << " but got nothing\n"; return; } @@ -2653,14 +2661,14 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { TRI->getSubClassWithSubReg(RC, SubIdx); if (!SRC) { report("Invalid subregister index for virtual register", MO, MONum); - errs() << "Register class " << TRI->getRegClassName(RC) - << " does not support subreg index " << SubIdx << "\n"; + OS << "Register class " << TRI->getRegClassName(RC) + << " does not support subreg index " << SubIdx << '\n'; return; } if (RC != SRC) { report("Invalid register class for subregister index", MO, MONum); - errs() << "Register class " << TRI->getRegClassName(RC) - << " does not fully support subreg index " << SubIdx << "\n"; + OS << "Register class " << TRI->getRegClassName(RC) + << " does not fully support subreg index " << SubIdx << '\n'; return; } } @@ -2682,9 +2690,9 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { } if (!RC->hasSuperClassEq(DRC)) { report("Illegal virtual register for instruction", MO, MONum); - errs() << "Expected a " << TRI->getRegClassName(DRC) - << " register, but got a " << TRI->getRegClassName(RC) - << " register\n"; + OS << "Expected a " << TRI->getRegClassName(DRC) + << " register, but got a " << TRI->getRegClassName(RC) + << " register\n"; } } } @@ -2733,11 +2741,11 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { } if (loads && 
!LI.liveAt(Idx.getRegSlot(true))) { report("Instruction loads from dead spill slot", MO, MONum); - errs() << "Live stack: " << LI << '\n'; + OS << "Live stack: " << LI << '\n'; } if (stores && !LI.liveAt(Idx.getRegSlot())) { report("Instruction stores to dead spill slot", MO, MONum); - errs() << "Live stack: " << LI << '\n'; + OS << "Live stack: " << LI << '\n'; } } break; @@ -3050,8 +3058,8 @@ MachineVerifier::visitMachineBasicBlockAfter(const MachineBasicBlock *MBB) { SlotIndex stop = Indexes->getMBBEndIdx(MBB); if (!(stop > lastIndex)) { report("Block ends before last instruction index", MBB); - errs() << "Block ends at " << stop - << " last instruction was at " << lastIndex << '\n'; + OS << "Block ends at " << stop << " last instruction was at " << lastIndex + << '\n'; } lastIndex = stop; } @@ -3296,8 +3304,8 @@ void MachineVerifier::checkPHIOps(const MachineBasicBlock &MBB) { for (MachineBasicBlock *Pred : MBB.predecessors()) { if (!seen.count(Pred)) { report("Missing PHI operand", &Phi); - errs() << printMBBReference(*Pred) - << " is a predecessor according to the CFG.\n"; + OS << printMBBReference(*Pred) + << " is a predecessor according to the CFG.\n"; } } } @@ -3306,9 +3314,10 @@ void MachineVerifier::checkPHIOps(const MachineBasicBlock &MBB) { static void verifyConvergenceControl(const MachineFunction &MF, MachineDominatorTree &DT, - std::function FailureCB) { + std::function FailureCB, + raw_ostream &OS) { MachineConvergenceVerifier CV; - CV.initialize(&errs(), FailureCB, MF); + CV.initialize(&OS, FailureCB, MF); for (const auto &MBB : MF) { CV.visit(MBB); @@ -3326,7 +3335,7 @@ void MachineVerifier::visitMachineFunctionAfter() { auto FailureCB = [this](const Twine &Message) { report(Message.str().c_str(), MF); }; - verifyConvergenceControl(*MF, DT, FailureCB); + verifyConvergenceControl(*MF, DT, FailureCB, OS); calcRegsPassed(); @@ -3342,8 +3351,8 @@ void MachineVerifier::visitMachineFunctionAfter() { for (Register VReg : MInfo.vregsRequired) if (MInfo.regsKilled.count(VReg)) { report("Virtual register killed in block, but needed live out.", &MBB); - errs() << "Virtual register " << printReg(VReg) - << " is used after the block.\n"; + OS << "Virtual register " << printReg(VReg) + << " is used after the block.\n"; } } @@ -3379,9 +3388,8 @@ void MachineVerifier::visitMachineFunctionAfter() { if (!PInfo.regsLiveOut.count(LiveInReg)) { report("Live in register not found to be live out from predecessor.", &MBB); - errs() << TRI->getName(LiveInReg) - << " not found to be live out from " - << printMBBReference(*Pred) << "\n"; + OS << TRI->getName(LiveInReg) << " not found to be live out from " + << printMBBReference(*Pred) << '\n'; } } } @@ -3418,14 +3426,14 @@ void MachineVerifier::verifyLiveVariables() { if (MInfo.vregsRequired.count(Reg)) { if (!VI.AliveBlocks.test(MBB.getNumber())) { report("LiveVariables: Block missing from AliveBlocks", &MBB); - errs() << "Virtual register " << printReg(Reg) - << " must be live through the block.\n"; + OS << "Virtual register " << printReg(Reg) + << " must be live through the block.\n"; } } else { if (VI.AliveBlocks.test(MBB.getNumber())) { report("LiveVariables: Block should not be in AliveBlocks", &MBB); - errs() << "Virtual register " << printReg(Reg) - << " is not needed live through the block.\n"; + OS << "Virtual register " << printReg(Reg) + << " is not needed live through the block.\n"; } } } @@ -3443,7 +3451,7 @@ void MachineVerifier::verifyLiveIntervals() { if (!LiveInts->hasInterval(Reg)) { report("Missing live interval for virtual 
register", MF); - errs() << printReg(Reg, TRI) << " still has defs or uses\n"; + OS << printReg(Reg, TRI) << " still has defs or uses\n"; continue; } @@ -3755,9 +3763,9 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, report("Register not marked live out of predecessor", Pred); report_context(LR, Reg, LaneMask); report_context(*VNI); - errs() << " live into " << printMBBReference(*MFI) << '@' - << LiveInts->getMBBStartIdx(&*MFI) << ", not live before " - << PEnd << '\n'; + OS << " live into " << printMBBReference(*MFI) << '@' + << LiveInts->getMBBStartIdx(&*MFI) << ", not live before " << PEnd + << '\n'; continue; } @@ -3765,10 +3773,10 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, if (!IsPHI && PVNI != VNI) { report("Different value live out of predecessor", Pred); report_context(LR, Reg, LaneMask); - errs() << "Valno #" << PVNI->id << " live out of " - << printMBBReference(*Pred) << '@' << PEnd << "\nValno #" - << VNI->id << " live into " << printMBBReference(*MFI) << '@' - << LiveInts->getMBBStartIdx(&*MFI) << '\n'; + OS << "Valno #" << PVNI->id << " live out of " + << printMBBReference(*Pred) << '@' << PEnd << "\nValno #" << VNI->id + << " live into " << printMBBReference(*MFI) << '@' + << LiveInts->getMBBStartIdx(&*MFI) << '\n'; } } if (&*MFI == EndMBB) @@ -3823,11 +3831,11 @@ void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) { report("Multiple connected components in live interval", MF); report_context(LI); for (unsigned comp = 0; comp != NumComp; ++comp) { - errs() << comp << ": valnos"; + OS << comp << ": valnos"; for (const VNInfo *I : LI.valnos) if (comp == ConEQ.getEqClass(I)) - errs() << ' ' << I->id; - errs() << '\n'; + OS << ' ' << I->id; + OS << '\n'; } } } @@ -3889,9 +3897,9 @@ void MachineVerifier::verifyStackFrame() { report("Call frame size on entry does not match value computed from " "predecessor", MBB); - errs() << "Call frame size on entry " << MBB->getCallFrameSize() - << " does not match value computed from predecessor " - << -BBState.EntryValue << '\n'; + OS << "Call frame size on entry " << MBB->getCallFrameSize() + << " does not match value computed from predecessor " + << -BBState.EntryValue << '\n'; } // Update stack state by checking contents of MBB. 
@@ -3914,8 +3922,8 @@ void MachineVerifier::verifyStackFrame() { BBState.ExitValue; if (BBState.ExitIsSetup && AbsSPAdj != Size) { report("FrameDestroy is after FrameSetup ", &I); - errs() << "FrameDestroy <" << Size << "> is after FrameSetup <" - << AbsSPAdj << ">.\n"; + OS << "FrameDestroy <" << Size << "> is after FrameSetup <" + << AbsSPAdj << ">.\n"; } if (!MRI->isSSA() && !MF->getFrameInfo().adjustsStack()) report("AdjustsStack not set in presence of a frame pseudo " @@ -3933,11 +3941,11 @@ void MachineVerifier::verifyStackFrame() { (SPState[Pred->getNumber()].ExitValue != BBState.EntryValue || SPState[Pred->getNumber()].ExitIsSetup != BBState.EntryIsSetup)) { report("The exit stack state of a predecessor is inconsistent.", MBB); - errs() << "Predecessor " << printMBBReference(*Pred) - << " has exit state (" << SPState[Pred->getNumber()].ExitValue - << ", " << SPState[Pred->getNumber()].ExitIsSetup << "), while " - << printMBBReference(*MBB) << " has entry state (" - << BBState.EntryValue << ", " << BBState.EntryIsSetup << ").\n"; + OS << "Predecessor " << printMBBReference(*Pred) << " has exit state (" + << SPState[Pred->getNumber()].ExitValue << ", " + << SPState[Pred->getNumber()].ExitIsSetup << "), while " + << printMBBReference(*MBB) << " has entry state (" + << BBState.EntryValue << ", " << BBState.EntryIsSetup << ").\n"; } } @@ -3948,11 +3956,11 @@ void MachineVerifier::verifyStackFrame() { (SPState[Succ->getNumber()].EntryValue != BBState.ExitValue || SPState[Succ->getNumber()].EntryIsSetup != BBState.ExitIsSetup)) { report("The entry stack state of a successor is inconsistent.", MBB); - errs() << "Successor " << printMBBReference(*Succ) - << " has entry state (" << SPState[Succ->getNumber()].EntryValue - << ", " << SPState[Succ->getNumber()].EntryIsSetup << "), while " - << printMBBReference(*MBB) << " has exit state (" - << BBState.ExitValue << ", " << BBState.ExitIsSetup << ").\n"; + OS << "Successor " << printMBBReference(*Succ) << " has entry state (" + << SPState[Succ->getNumber()].EntryValue << ", " + << SPState[Succ->getNumber()].EntryIsSetup << "), while " + << printMBBReference(*MBB) << " has exit state (" + << BBState.ExitValue << ", " << BBState.ExitIsSetup << ").\n"; } } diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index 5001b4fec58f2e..1ad70c86d68e3d 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -1054,7 +1054,7 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit, } if (VerifyEnabled) - MF->verify(this, "After splitting live range around region"); + MF->verify(this, "After splitting live range around region", &errs()); } MCRegister RAGreedy::tryRegionSplit(const LiveInterval &VirtReg, @@ -1323,7 +1323,7 @@ unsigned RAGreedy::tryBlockSplit(const LiveInterval &VirtReg, } if (VerifyEnabled) - MF->verify(this, "After splitting live range around basic blocks"); + MF->verify(this, "After splitting live range around basic blocks", &errs()); return 0; } @@ -2507,7 +2507,7 @@ MCRegister RAGreedy::selectOrSplitImpl(const LiveInterval &VirtReg, DebugVars->splitRegister(VirtReg.reg(), LRE.regs(), *LIS); if (VerifyEnabled) - MF->verify(this, "After spilling"); + MF->verify(this, "After spilling", &errs()); } // The live virtual register requesting allocation was spilled, so tell @@ -2711,7 +2711,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { TII = MF->getSubtarget().getInstrInfo(); if (VerifyEnabled) - MF->verify(this, "Before greedy register allocator"); + 
MF->verify(this, "Before greedy register allocator", &errs()); RegAllocBase::init(getAnalysis(), getAnalysis().getLIS(), @@ -2770,7 +2770,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { tryHintsRecoloring(); if (VerifyEnabled) - MF->verify(this, "Before post optimization"); + MF->verify(this, "Before post optimization", &errs()); postOptimization(); reportStats(); diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index 99125200c1a4f1..2e1f498c090d1a 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -4239,7 +4239,7 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) { JoinSplitEdges = EnableJoinSplits; if (VerifyCoalescing) - MF->verify(this, "Before register coalescing"); + MF->verify(this, "Before register coalescing", &errs()); DbgVRegToValues.clear(); buildVRegToDbgValueMap(fn); @@ -4299,7 +4299,7 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) { LLVM_DEBUG(dump()); if (VerifyCoalescing) - MF->verify(this, "After register coalescing"); + MF->verify(this, "After register coalescing", &errs()); return true; } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index f5fbc01cd95e96..3c087727a80126 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2326,15 +2326,7 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, /// Return true if sincos libcall is available. static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) { - RTLIB::Libcall LC; - switch (Node->getSimpleValueType(0).SimpleTy) { - default: llvm_unreachable("Unexpected request for libcall!"); - case MVT::f32: LC = RTLIB::SINCOS_F32; break; - case MVT::f64: LC = RTLIB::SINCOS_F64; break; - case MVT::f80: LC = RTLIB::SINCOS_F80; break; - case MVT::f128: LC = RTLIB::SINCOS_F128; break; - case MVT::ppcf128: LC = RTLIB::SINCOS_PPCF128; break; - } + RTLIB::Libcall LC = RTLIB::getFSINCOS(Node->getSimpleValueType(0).SimpleTy); return TLI.getLibcallName(LC) != nullptr; } @@ -2355,68 +2347,72 @@ static bool useSinCos(SDNode *Node) { } /// Issue libcalls to sincos to compute sin / cos pairs. -void -SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node, - SmallVectorImpl &Results) { - RTLIB::Libcall LC; - switch (Node->getSimpleValueType(0).SimpleTy) { - default: llvm_unreachable("Unexpected request for libcall!"); - case MVT::f32: LC = RTLIB::SINCOS_F32; break; - case MVT::f64: LC = RTLIB::SINCOS_F64; break; - case MVT::f80: LC = RTLIB::SINCOS_F80; break; - case MVT::f128: LC = RTLIB::SINCOS_F128; break; - case MVT::ppcf128: LC = RTLIB::SINCOS_PPCF128; break; +void SelectionDAGLegalize::ExpandSinCosLibCall( + SDNode *Node, SmallVectorImpl &Results) { + EVT VT = Node->getValueType(0); + Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + RTLIB::Libcall LC = RTLIB::getFSINCOS(VT); + + // Find users of the node that store the results (and share input chains). The + // destination pointers can be used instead of creating stack allocations. 
+ SDValue StoresInChain{}; + std::array ResultStores = {nullptr}; + for (SDNode *User : Node->uses()) { + if (!ISD::isNormalStore(User)) + continue; + auto *ST = cast(User); + if (!ST->isSimple() || ST->getAddressSpace() != 0 || + ST->getAlign() < DAG.getDataLayout().getABITypeAlign(Ty) || + (StoresInChain && ST->getChain() != StoresInChain) || + Node->isPredecessorOf(ST->getChain().getNode())) + continue; + ResultStores[ST->getValue().getResNo()] = ST; + StoresInChain = ST->getChain(); } - // The input chain to this libcall is the entry node of the function. - // Legalizing the call will automatically add the previous call to the - // dependence. - SDValue InChain = DAG.getEntryNode(); - - EVT RetVT = Node->getValueType(0); - Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); - TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; + TargetLowering::ArgListEntry Entry{}; // Pass the argument. Entry.Node = Node->getOperand(0); - Entry.Ty = RetTy; - Entry.IsSExt = false; - Entry.IsZExt = false; - Args.push_back(Entry); - - // Pass the return address of sin. - SDValue SinPtr = DAG.CreateStackTemporary(RetVT); - Entry.Node = SinPtr; - Entry.Ty = PointerType::getUnqual(RetTy->getContext()); - Entry.IsSExt = false; - Entry.IsZExt = false; + Entry.Ty = Ty; Args.push_back(Entry); - // Also pass the return address of the cos. - SDValue CosPtr = DAG.CreateStackTemporary(RetVT); - Entry.Node = CosPtr; - Entry.Ty = PointerType::getUnqual(RetTy->getContext()); - Entry.IsSExt = false; - Entry.IsZExt = false; - Args.push_back(Entry); + // Pass the output pointers for sin and cos. + SmallVector ResultPtrs{}; + for (StoreSDNode *ST : ResultStores) { + SDValue ResultPtr = ST ? ST->getBasePtr() : DAG.CreateStackTemporary(VT); + Entry.Node = ResultPtr; + Entry.Ty = PointerType::getUnqual(Ty->getContext()); + Args.push_back(Entry); + ResultPtrs.push_back(ResultPtr); + } + SDLoc DL(Node); + SDValue InChain = StoresInChain ? StoresInChain : DAG.getEntryNode(); SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), TLI.getPointerTy(DAG.getDataLayout())); - - SDLoc dl(Node); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(InChain).setLibCallee( + CLI.setDebugLoc(DL).setChain(InChain).setLibCallee( TLI.getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), Callee, std::move(Args)); - std::pair CallInfo = TLI.LowerCallTo(CLI); + auto [Call, OutChain] = TLI.LowerCallTo(CLI); - Results.push_back( - DAG.getLoad(RetVT, dl, CallInfo.second, SinPtr, MachinePointerInfo())); - Results.push_back( - DAG.getLoad(RetVT, dl, CallInfo.second, CosPtr, MachinePointerInfo())); + for (auto [ResNo, ResultPtr] : llvm::enumerate(ResultPtrs)) { + MachinePointerInfo PtrInfo; + if (StoreSDNode *ST = ResultStores[ResNo]) { + // Replace store with the library call. 
+ DAG.ReplaceAllUsesOfValueWith(SDValue(ST, 0), OutChain); + PtrInfo = ST->getPointerInfo(); + } else { + PtrInfo = MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), + cast(ResultPtr)->getIndex()); + } + SDValue LoadResult = DAG.getLoad(VT, DL, OutChain, ResultPtr, PtrInfo); + Results.push_back(LoadResult); + } } SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 7eb17d5c1540ae..5b072521e5adf4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -11237,6 +11237,12 @@ void SelectionDAG::salvageDebugInfo(SDNode &N) { if (!N.getHasDebugValue()) return; + auto GetLocationOperand = [](SDNode *Node, unsigned ResNo) { + if (auto *FISDN = dyn_cast(Node)) + return SDDbgOperand::fromFrameIdx(FISDN->getIndex()); + return SDDbgOperand::fromNode(Node, ResNo); + }; + SmallVector ClonedDVs; for (auto *DV : GetDbgValues(&N)) { if (DV->isInvalidated()) @@ -11272,7 +11278,7 @@ void SelectionDAG::salvageDebugInfo(SDNode &N) { if (NewLocOps[i].getKind() != SDDbgOperand::SDNODE || NewLocOps[i].getSDNode() != &N) continue; - NewLocOps[i] = SDDbgOperand::fromNode(N0.getNode(), N0.getResNo()); + NewLocOps[i] = GetLocationOperand(N0.getNode(), N0.getResNo()); if (RHSConstant) { SmallVector ExprOps; DIExpression::appendOffset(ExprOps, Offset); @@ -11327,7 +11333,7 @@ void SelectionDAG::salvageDebugInfo(SDNode &N) { NewLocOps[i].getSDNode() != &N) continue; - NewLocOps[i] = SDDbgOperand::fromNode(N0.getNode(), N0.getResNo()); + NewLocOps[i] = GetLocationOperand(N0.getNode(), N0.getResNo()); DbgExpression = DIExpression::appendOpsToArg(DbgExpression, ExtOps, i); Changed = true; } @@ -11350,7 +11356,11 @@ void SelectionDAG::salvageDebugInfo(SDNode &N) { } for (SDDbgValue *Dbg : ClonedDVs) { - assert(!Dbg->getSDNodes().empty() && + assert((!Dbg->getSDNodes().empty() || + llvm::any_of(Dbg->getLocationOps(), + [&](const SDDbgOperand &Op) { + return Op.getKind() == SDDbgOperand::FRAMEIX; + })) && "Salvaged DbgValue should depend on a new SDNode"); AddDbgValue(Dbg, false); } diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 9fdde454559171..1f49d60c970593 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -400,6 +400,11 @@ RTLIB::Libcall RTLIB::getFREXP(EVT RetVT) { FREXP_PPCF128); } +RTLIB::Libcall RTLIB::getFSINCOS(EVT RetVT) { + return getFPLibCall(RetVT, SINCOS_F32, SINCOS_F64, SINCOS_F80, SINCOS_F128, + SINCOS_PPCF128); +} + RTLIB::Libcall RTLIB::getOutlineAtomicHelper(const Libcall (&LC)[5][4], AtomicOrdering Order, uint64_t MemSize) { diff --git a/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp b/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp index bca31253683530..947db9cbcd92d0 100644 --- a/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp +++ b/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp @@ -933,7 +933,7 @@ void DwarfStreamer::emitLineTablePrologueV5IncludeAndFileTable( LineSectionSize += MS->emitULEB128IntValue(StrForm); LineSectionSize += MS->emitULEB128IntValue(dwarf::DW_LNCT_directory_index); - LineSectionSize += MS->emitULEB128IntValue(dwarf::DW_FORM_data1); + LineSectionSize += MS->emitULEB128IntValue(dwarf::DW_FORM_udata); if (HasChecksums) { LineSectionSize += MS->emitULEB128IntValue(dwarf::DW_LNCT_MD5); @@ -952,8 +952,7 @@ void DwarfStreamer::emitLineTablePrologueV5IncludeAndFileTable( // file_names 
(sequence of file name entries). for (auto File : P.FileNames) { emitLineTableString(P, File.Name, DebugStrPool, DebugLineStrPool); - MS->emitInt8(File.DirIdx); - LineSectionSize += 1; + LineSectionSize += MS->emitULEB128IntValue(File.DirIdx); if (HasChecksums) { MS->emitBinaryData( StringRef(reinterpret_cast(File.Checksum.data()), diff --git a/llvm/lib/DWARFLinker/Parallel/DebugLineSectionEmitter.h b/llvm/lib/DWARFLinker/Parallel/DebugLineSectionEmitter.h index 38357c7f97314c..b035c4b1d6c30d 100644 --- a/llvm/lib/DWARFLinker/Parallel/DebugLineSectionEmitter.h +++ b/llvm/lib/DWARFLinker/Parallel/DebugLineSectionEmitter.h @@ -215,7 +215,7 @@ class DebugLineSectionEmitter { encodeULEB128(FileNameForm, Section.OS); encodeULEB128(dwarf::DW_LNCT_directory_index, Section.OS); - encodeULEB128(dwarf::DW_FORM_data1, Section.OS); + encodeULEB128(dwarf::DW_FORM_udata, Section.OS); if (HasChecksums) { encodeULEB128(dwarf::DW_LNCT_MD5, Section.OS); @@ -242,7 +242,7 @@ class DebugLineSectionEmitter { // A null-terminated string containing the full or relative path name of a // source file. Section.emitString(FileNameForm, *FileNameStr); - Section.emitIntVal(File.DirIdx, 1); + encodeULEB128(File.DirIdx, Section.OS); if (HasChecksums) { assert((File.Checksum.size() == 16) && diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp index b27a1a19acae0a..6a32ccc3776510 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp @@ -156,6 +156,9 @@ class ELFLinkGraphBuilder_x86_64 : public ELFLinkGraphBuilder { case ELF::R_X86_64_PC8: Kind = x86_64::Delta8; break; + case ELF::R_X86_64_PC16: + Kind = x86_64::Delta16; + break; case ELF::R_X86_64_PC32: case ELF::R_X86_64_GOTPC32: Kind = x86_64::Delta32; diff --git a/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp index 9f7ece8ffbbb3f..cca4358a377660 100644 --- a/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp @@ -34,6 +34,8 @@ const char *getEdgeKindName(Edge::Kind K) { return "Delta64"; case Delta32: return "Delta32"; + case Delta16: + return "Delta16"; case Delta8: return "Delta8"; case NegDelta64: diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 592c0584018c00..4d2c4dda97569b 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1275,6 +1275,16 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, else if (Name.consume_front("rotate.")) // nvvm.rotate.{b32,b64,right.b64} Expand = Name == "b32" || Name == "b64" || Name == "right.b64"; + else if (Name.consume_front("ptr.gen.to.")) + // nvvm.ptr.gen.to.{local,shared,global,constant} + Expand = Name.starts_with("local") || Name.starts_with("shared") || + Name.starts_with("global") || Name.starts_with("constant"); + else if (Name.consume_front("ptr.")) + // nvvm.ptr.{local,shared,global,constant}.to.gen + Expand = + (Name.consume_front("local") || Name.consume_front("shared") || + Name.consume_front("global") || Name.consume_front("constant")) && + Name.starts_with(".to.gen"); else Expand = false; @@ -2338,6 +2348,15 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI, Value *ZExtShiftAmt = Builder.CreateZExt(CI->getOperand(1), Int64Ty); Rep = Builder.CreateIntrinsic(Int64Ty, Intrinsic::fshr, {Arg, Arg, ZExtShiftAmt}); + } else if ((Name.consume_front("ptr.gen.to.") && + (Name.starts_with("local") || Name.starts_with("shared") || + 
Name.starts_with("global") || Name.starts_with("constant"))) || + (Name.consume_front("ptr.") && + (Name.consume_front("local") || Name.consume_front("shared") || + Name.consume_front("global") || + Name.consume_front("constant")) && + Name.starts_with(".to.gen"))) { + Rep = Builder.CreateAddrSpaceCast(CI->getArgOperand(0), CI->getType()); } else { Intrinsic::ID IID = shouldUpgradeNVPTXBF16Intrinsic(Name); if (IID != Intrinsic::not_intrinsic && diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index a72a0799a08025..0f99b4a13d2f95 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -986,6 +986,32 @@ Init *UnOpInit::Fold(Record *CurRec, bool IsFinal) const { } } break; + + case LISTFLATTEN: + if (ListInit *LHSList = dyn_cast(LHS)) { + ListRecTy *InnerListTy = dyn_cast(LHSList->getElementType()); + // list of non-lists, !listflatten() is a NOP. + if (!InnerListTy) + return LHS; + + auto Flatten = [](ListInit *List) -> std::optional> { + std::vector Flattened; + // Concatenate elements of all the inner lists. + for (Init *InnerInit : List->getValues()) { + ListInit *InnerList = dyn_cast(InnerInit); + if (!InnerList) + return std::nullopt; + for (Init *InnerElem : InnerList->getValues()) + Flattened.push_back(InnerElem); + }; + return Flattened; + }; + + auto Flattened = Flatten(LHSList); + if (Flattened) + return ListInit::get(*Flattened, InnerListTy->getElementType()); + } + break; } return const_cast(this); } @@ -1010,6 +1036,9 @@ std::string UnOpInit::getAsString() const { case EMPTY: Result = "!empty"; break; case GETDAGOP: Result = "!getdagop"; break; case LOG2 : Result = "!logtwo"; break; + case LISTFLATTEN: + Result = "!listflatten"; + break; case REPR: Result = "!repr"; break; diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp index 62a884e01a5306..8fe7f69ecf8e59 100644 --- a/llvm/lib/TableGen/TGLexer.cpp +++ b/llvm/lib/TableGen/TGLexer.cpp @@ -628,6 +628,7 @@ tgtok::TokKind TGLexer::LexExclaim() { .Case("foreach", tgtok::XForEach) .Case("filter", tgtok::XFilter) .Case("listconcat", tgtok::XListConcat) + .Case("listflatten", tgtok::XListFlatten) .Case("listsplat", tgtok::XListSplat) .Case("listremove", tgtok::XListRemove) .Case("range", tgtok::XRange) diff --git a/llvm/lib/TableGen/TGLexer.h b/llvm/lib/TableGen/TGLexer.h index 9adc03ccc72b85..4fa4d84d0535d3 100644 --- a/llvm/lib/TableGen/TGLexer.h +++ b/llvm/lib/TableGen/TGLexer.h @@ -122,6 +122,7 @@ enum TokKind { XSRL, XSHL, XListConcat, + XListFlatten, XListSplat, XStrConcat, XInterleave, diff --git a/llvm/lib/TableGen/TGParser.cpp b/llvm/lib/TableGen/TGParser.cpp index 1a60c2a567a297..54c9a902ec27a1 100644 --- a/llvm/lib/TableGen/TGParser.cpp +++ b/llvm/lib/TableGen/TGParser.cpp @@ -1190,6 +1190,7 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { case tgtok::XNOT: case tgtok::XToLower: case tgtok::XToUpper: + case tgtok::XListFlatten: case tgtok::XLOG2: case tgtok::XHead: case tgtok::XTail: @@ -1235,6 +1236,11 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { Code = UnOpInit::NOT; Type = IntRecTy::get(Records); break; + case tgtok::XListFlatten: + Lex.Lex(); // eat the operation. + Code = UnOpInit::LISTFLATTEN; + Type = IntRecTy::get(Records); // Bogus type used here. 
+ break; case tgtok::XLOG2: Lex.Lex(); // eat the operation Code = UnOpInit::LOG2; @@ -1309,7 +1315,8 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { } } - if (Code == UnOpInit::HEAD || Code == UnOpInit::TAIL) { + if (Code == UnOpInit::HEAD || Code == UnOpInit::TAIL || + Code == UnOpInit::LISTFLATTEN) { ListInit *LHSl = dyn_cast(LHS); TypedInit *LHSt = dyn_cast(LHS); if (!LHSl && !LHSt) { @@ -1328,6 +1335,8 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { TokError("empty list argument in unary operator"); return nullptr; } + bool UseElementType = + Code == UnOpInit::HEAD || Code == UnOpInit::LISTFLATTEN; if (LHSl) { Init *Item = LHSl->getElement(0); TypedInit *Itemt = dyn_cast(Item); @@ -1335,12 +1344,25 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { TokError("untyped list element in unary operator"); return nullptr; } - Type = (Code == UnOpInit::HEAD) ? Itemt->getType() - : ListRecTy::get(Itemt->getType()); + Type = UseElementType ? Itemt->getType() + : ListRecTy::get(Itemt->getType()); } else { assert(LHSt && "expected list type argument in unary operator"); ListRecTy *LType = dyn_cast(LHSt->getType()); - Type = (Code == UnOpInit::HEAD) ? LType->getElementType() : LType; + Type = UseElementType ? LType->getElementType() : LType; + } + + // for !listflatten, we expect a list of lists, but also support a list of + // non-lists, where !listflatten will be a NOP. + if (Code == UnOpInit::LISTFLATTEN) { + ListRecTy *InnerListTy = dyn_cast(Type); + if (InnerListTy) { + // listflatten will convert list> to list. + Type = ListRecTy::get(InnerListTy->getElementType()); + } else { + // If its a list of non-lists, !listflatten will be a NOP. + Type = ListRecTy::get(Type); + } } } @@ -1378,7 +1400,7 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { case tgtok::XExists: { // Value ::= !exists '<' Type '>' '(' Value ')' - Lex.Lex(); // eat the operation + Lex.Lex(); // eat the operation. RecTy *Type = ParseOperatorType(); if (!Type) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 7e041b086599b9..fde07d84e97f58 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -2757,7 +2757,11 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( bool FPOffsetFits = !ForSimm || FPOffset >= -256; PreferFP |= Offset > -FPOffset && !SVEStackSize; - if (MFI.hasVarSizedObjects()) { + if (FPOffset >= 0) { + // If the FPOffset is positive, that'll always be best, as the SP/BP + // will be even further away. + UseFP = true; + } else if (MFI.hasVarSizedObjects()) { // If we have variable sized objects, we can use either FP or BP, as the // SP offset is unknown. We can use the base pointer if we have one and // FP is not preferred. If not, we're stuck with using FP. @@ -2769,11 +2773,6 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( // else we can use BP and FP, but the offset from FP won't fit. // That will make us scavenge registers which we can probably avoid by // using BP. If it won't fit for BP either, we'll scavenge anyway. - } else if (FPOffset >= 0) { - // Use SP or FP, whichever gives us the best chance of the offset - // being in range for direct access. If the FPOffset is positive, - // that'll always be best, as the SP will be even further away. 
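The reordering in resolveFrameOffsetReference above lets a non-negative FP offset win before the variable-sized-object and funclet checks are consulted. A reduced sketch of the resulting preference order, with the remaining heuristics elided and the parameter names invented for illustration:

    enum class BaseReg { FP, BP, SP };

    // Reduced model of the new check order: an offset that is >= 0 from FP is
    // never farther than the SP/BP alternatives, so FP is chosen immediately.
    BaseReg pickBaseReg(long FPOffset, bool HasVarSizedObjects, bool HasBasePtr,
                        bool PreferFP) {
      if (FPOffset >= 0)
        return BaseReg::FP;
      if (HasVarSizedObjects)                 // SP offset is unknown here.
        return (HasBasePtr && !PreferFP) ? BaseReg::BP : BaseReg::FP;
      return BaseReg::SP;                     // remaining heuristics elided.
    }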
- UseFP = true; } else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) { // Funclets access the locals contained in the parent's stack frame // via the frame pointer, so we have to use the FP in the parent diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index da0798ebf79578..ac05a44abc2dd9 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -2140,6 +2140,16 @@ static std::optional instCombineSVESrshl(InstCombiner &IC, return IC.replaceInstUsesWith(II, LSL); } +static std::optional instCombineSVEInsr(InstCombiner &IC, + IntrinsicInst &II) { + Value *Vec = II.getOperand(0); + + if (getSplatValue(Vec) == II.getOperand(1)) + return IC.replaceInstUsesWith(II, Vec); + + return std::nullopt; +} + std::optional AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { @@ -2460,6 +2470,8 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, return instCombineSVESrshl(IC, II); case Intrinsic::aarch64_sve_dupq_lane: return instCombineSVEDupqLane(IC, II); + case Intrinsic::aarch64_sve_insr: + return instCombineSVEInsr(IC, II); } return std::nullopt; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 43a5da12c411a4..301de624d3952d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9549,6 +9549,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Opc = AMDGPU::S_GET_BARRIER_STATE_IMM; SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32); Ops.push_back(K); + Ops.push_back(Chain); } else { Opc = AMDGPU::S_GET_BARRIER_STATE_M0; SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(2)); @@ -10215,7 +10216,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, 0); } Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0)); - } else if (!IsInlinableBarID) { + } else if (IsInlinableBarID) { + Ops.push_back(Chain); + } else { Ops.push_back(copyToM0(DAG, Chain, DL, BarOp).getValue(0)); } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 56c96ea943b89d..7f942de74bdcc9 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -1109,11 +1109,21 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { AddrSpaceCastSDNode *CastN = cast(N); unsigned SrcAddrSpace = CastN->getSrcAddressSpace(); unsigned DstAddrSpace = CastN->getDestAddressSpace(); + SDLoc DL(N); assert(SrcAddrSpace != DstAddrSpace && "addrspacecast must be between different address spaces"); if (DstAddrSpace == ADDRESS_SPACE_GENERIC) { // Specific to generic + + if (TM.is64Bit() && TM.getPointerSizeInBits(SrcAddrSpace) == 32) { + SDValue CvtNone = + CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32); + SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u64_u32, DL, MVT::i64, + Src, CvtNone); + Src = SDValue(Cvt, 0); + } + unsigned Opc; switch (SrcAddrSpace) { default: report_fatal_error("Bad address space in addrspacecast"); @@ -1121,26 +1131,16 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global; break; case ADDRESS_SPACE_SHARED: - Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32 - ? NVPTX::cvta_shared_6432 - : NVPTX::cvta_shared_64) - : NVPTX::cvta_shared; + Opc = TM.is64Bit() ? 
NVPTX::cvta_shared_64 : NVPTX::cvta_shared; break; case ADDRESS_SPACE_CONST: - Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32 - ? NVPTX::cvta_const_6432 - : NVPTX::cvta_const_64) - : NVPTX::cvta_const; + Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const; break; case ADDRESS_SPACE_LOCAL: - Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32 - ? NVPTX::cvta_local_6432 - : NVPTX::cvta_local_64) - : NVPTX::cvta_local; + Opc = TM.is64Bit() ? NVPTX::cvta_local_64 : NVPTX::cvta_local; break; } - ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), - Src)); + ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src)); return; } else { // Generic to specific @@ -1153,30 +1153,28 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global; break; case ADDRESS_SPACE_SHARED: - Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32 - ? NVPTX::cvta_to_shared_3264 - : NVPTX::cvta_to_shared_64) - : NVPTX::cvta_to_shared; + Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared; break; case ADDRESS_SPACE_CONST: - Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32 - ? NVPTX::cvta_to_const_3264 - : NVPTX::cvta_to_const_64) - : NVPTX::cvta_to_const; + Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const; break; case ADDRESS_SPACE_LOCAL: - Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32 - ? NVPTX::cvta_to_local_3264 - : NVPTX::cvta_to_local_64) - : NVPTX::cvta_to_local; + Opc = TM.is64Bit() ? NVPTX::cvta_to_local_64 : NVPTX::cvta_to_local; break; case ADDRESS_SPACE_PARAM: - Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64 - : NVPTX::nvvm_ptr_gen_to_param; + Opc = TM.is64Bit() ? 
NVPTX::IMOV64rr : NVPTX::IMOV32rr; break; } - ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), - Src)); + + SDNode *CVTA = CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src); + if (TM.is64Bit() && TM.getPointerSizeInBits(DstAddrSpace) == 32) { + SDValue CvtNone = + CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32); + CVTA = CurDAG->getMachineNode(NVPTX::CVT_u32_u64, DL, MVT::i32, + SDValue(CVTA, 0), CvtNone); + } + + ReplaceNode(N, CVTA); return; } } diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index f6bbf4c2ffc02f..c3a8a774673f24 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -174,10 +174,6 @@ def hasSM90a : Predicate<"Subtarget->getFullSmVersion() == 901">; def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70" "&& Subtarget->getPTXVersion() >= 64)">; -def useShortPtrLocal : Predicate<"TM.is64Bit() && TM.getPointerSizeInBits(ADDRESS_SPACE_LOCAL) == 32">; -def useShortPtrShared : Predicate<"TM.is64Bit() && TM.getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32">; -def useShortPtrConst : Predicate<"TM.is64Bit() && TM.getPointerSizeInBits(ADDRESS_SPACE_CONST) == 32">; - def useFP16Math: Predicate<"Subtarget->allowFP16Math()">; def hasBF16Math: Predicate<"Subtarget->hasBF16Math()">; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 2688cfbe5e824f..042b0965ea33ff 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -2537,59 +2537,45 @@ defm INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float32Regs>; -multiclass NG_TO_G { +multiclass NG_TO_G { def "" : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), - !strconcat("cvta.", Str, ".u32 \t$result, $src;"), - [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>; + "cvta." # Str # ".u32 \t$result, $src;", []>; def _64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), - !strconcat("cvta.", Str, ".u64 \t$result, $src;"), - [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>; - def _6432 : NVPTXInst<(outs Int64Regs:$result), (ins Int32Regs:$src), - "{{ .reg .b64 %tmp;\n\t" - #" cvt.u64.u32 \t%tmp, $src;\n\t" - #" cvta." # Str # ".u64 \t$result, %tmp; }}", - [(set Int64Regs:$result, (Intrin Int32Regs:$src))]>, - Requires<[ShortPtr]>; + "cvta." # Str # ".u64 \t$result, $src;", []>; } -multiclass G_TO_NG { +multiclass G_TO_NG { def "" : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), - !strconcat("cvta.to.", Str, ".u32 \t$result, $src;"), - [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>; + "cvta.to." # Str # ".u32 \t$result, $src;", []>; def _64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), - !strconcat("cvta.to.", Str, ".u64 \t$result, $src;"), - [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>; - def _3264 : NVPTXInst<(outs Int32Regs:$result), (ins Int64Regs:$src), - "{{ .reg .b64 %tmp;\n\t" - #" cvta.to." 
# Str # ".u64 \t%tmp, $src;\n\t" - #" cvt.u32.u64 \t$result, %tmp; }}", - [(set Int32Regs:$result, (Intrin Int64Regs:$src))]>, - Requires<[ShortPtr]>; -} - -defm cvta_local : NG_TO_G<"local", int_nvvm_ptr_local_to_gen, useShortPtrLocal>; -defm cvta_shared : NG_TO_G<"shared", int_nvvm_ptr_shared_to_gen, useShortPtrShared>; -defm cvta_global : NG_TO_G<"global", int_nvvm_ptr_global_to_gen, False>; -defm cvta_const : NG_TO_G<"const", int_nvvm_ptr_constant_to_gen, useShortPtrConst>; -defm cvta_param : NG_TO_G<"param", int_nvvm_ptr_param_to_gen, False>; - -defm cvta_to_local : G_TO_NG<"local", int_nvvm_ptr_gen_to_local, useShortPtrLocal>; -defm cvta_to_shared : G_TO_NG<"shared", int_nvvm_ptr_gen_to_shared, useShortPtrShared>; -defm cvta_to_global : G_TO_NG<"global", int_nvvm_ptr_gen_to_global, False>; -defm cvta_to_const : G_TO_NG<"const", int_nvvm_ptr_gen_to_constant, useShortPtrConst>; + "cvta.to." # Str # ".u64 \t$result, $src;", []>; +} + +defm cvta_local : NG_TO_G<"local">; +defm cvta_shared : NG_TO_G<"shared">; +defm cvta_global : NG_TO_G<"global">; +defm cvta_const : NG_TO_G<"const">; + +defm cvta_to_local : G_TO_NG<"local">; +defm cvta_to_shared : G_TO_NG<"shared">; +defm cvta_to_global : G_TO_NG<"global">; +defm cvta_to_const : G_TO_NG<"const">; + +// nvvm.ptr.param.to.gen +defm cvta_param : NG_TO_G<"param">; + +def : Pat<(int_nvvm_ptr_param_to_gen Int32Regs:$src), + (cvta_param Int32Regs:$src)>; + +def : Pat<(int_nvvm_ptr_param_to_gen Int64Regs:$src), + (cvta_param_64 Int64Regs:$src)>; // nvvm.ptr.gen.to.param -def nvvm_ptr_gen_to_param : NVPTXInst<(outs Int32Regs:$result), - (ins Int32Regs:$src), - "mov.u32 \t$result, $src;", - [(set Int32Regs:$result, - (int_nvvm_ptr_gen_to_param Int32Regs:$src))]>; -def nvvm_ptr_gen_to_param_64 : NVPTXInst<(outs Int64Regs:$result), - (ins Int64Regs:$src), - "mov.u64 \t$result, $src;", - [(set Int64Regs:$result, - (int_nvvm_ptr_gen_to_param Int64Regs:$src))]>; +def : Pat<(int_nvvm_ptr_gen_to_param Int32Regs:$src), + (IMOV32rr Int32Regs:$src)>; +def : Pat<(int_nvvm_ptr_gen_to_param Int64Regs:$src), + (IMOV64rr Int64Regs:$src)>; // nvvm.move intrinsicc def nvvm_move_i16 : NVPTXInst<(outs Int16Regs:$r), (ins Int16Regs:$s), @@ -2632,24 +2618,6 @@ def nvvm_move_sym64 : NVPTXInst<(outs Int64Regs:$r), (ins imem:$s), [(set Int64Regs:$r, (int_nvvm_move_ptr texternalsym:$s))]>;*/ - -// MoveParam %r1, param -// ptr_local_to_gen %r2, %r1 -// ptr_gen_to_local %r3, %r2 -// -> -// mov %r1, param - -// @TODO: Revisit this. There is a type -// contradiction between iPTRAny and iPTR for the addr defs, so the move_sym -// instructions are not currently defined. However, we can use the ptr -// variants and the asm printer will do the right thing. 
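With the llvm.nvvm.ptr.* conversion patterns removed from the .td file above, calls to those intrinsics are instead rewritten during auto-upgrade into a plain addrspacecast, as in the AutoUpgrade hunk earlier in this patch. A hedged sketch of that rewrite, with the helper name and parameters assumed for illustration:

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/InstrTypes.h"

    // The source and destination address spaces are already carried by the
    // call's operand and return types, so the whole legacy conversion
    // intrinsic collapses to a single cast instruction.
    static llvm::Value *upgradeNVVMPtrConversion(llvm::CallBase *CI,
                                                 llvm::IRBuilder<> &Builder) {
      return Builder.CreateAddrSpaceCast(CI->getArgOperand(0), CI->getType());
    }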
-def : Pat<(i64 (int_nvvm_ptr_gen_to_local (int_nvvm_ptr_local_to_gen - (MoveParam texternalsym:$src)))), - (nvvm_move_ptr64 texternalsym:$src)>; -def : Pat<(i32 (int_nvvm_ptr_gen_to_local (int_nvvm_ptr_local_to_gen - (MoveParam texternalsym:$src)))), - (nvvm_move_ptr32 texternalsym:$src)>; - def texsurf_handles : NVPTXInst<(outs Int64Regs:$result), (ins imem:$src), "mov.u64 \t$result, $src;", []>; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index b998a1eb11c300..bf822eb2c6eeb5 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -10152,13 +10152,15 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, unsigned OrigIdx = Op.getConstantOperandVal(2); const RISCVRegisterInfo *TRI = Subtarget.getRegisterInfo(); + if (OrigIdx == 0 && Vec.isUndef()) + return Op; + // We don't have the ability to slide mask vectors up indexed by their i1 // elements; the smallest we can do is i8. Often we are able to bitcast to // equivalent i8 vectors. Note that when inserting a fixed-length vector // into a scalable one, we might not necessarily have enough scalable // elements to safely divide by 8: nxv1i1 = insert nxv1i1, v4i1 is valid. - if (SubVecVT.getVectorElementType() == MVT::i1 && - (OrigIdx != 0 || !Vec.isUndef())) { + if (SubVecVT.getVectorElementType() == MVT::i1) { if (VecVT.getVectorMinNumElements() >= 8 && SubVecVT.getVectorMinNumElements() >= 8) { assert(OrigIdx % 8 == 0 && "Invalid index"); @@ -10196,8 +10198,6 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, // vector group up the full amount. const auto VLen = Subtarget.getRealVLen(); if (SubVecVT.isFixedLengthVector() && !VLen) { - if (OrigIdx == 0 && Vec.isUndef() && !VecVT.isFixedLengthVector()) - return Op; MVT ContainerVT = VecVT; if (VecVT.isFixedLengthVector()) { ContainerVT = getContainerForFixedLengthVector(VecVT); @@ -10208,11 +10208,6 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, DAG.getUNDEF(ContainerVT), SubVec, DAG.getVectorIdxConstant(0, DL)); - if (OrigIdx == 0 && Vec.isUndef() && VecVT.isFixedLengthVector()) { - SubVec = convertFromScalableVector(VecVT, SubVec, DAG, Subtarget); - return DAG.getBitcast(Op.getValueType(), SubVec); - } - SDValue Mask = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget).first; // Set the vector length to only the number of elements we care about. 
Note diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index ca3e47a4b78f23..3e1873e899680c 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -25,6 +25,7 @@ #include "llvm/IR/Type.h" #include "llvm/Support/Casting.h" #include +#include using namespace llvm; SPIRVGlobalRegistry::SPIRVGlobalRegistry(unsigned PointerSize) @@ -83,8 +84,11 @@ inline Register createTypeVReg(MachineIRBuilder &MIRBuilder) { } SPIRVType *SPIRVGlobalRegistry::getOpTypeBool(MachineIRBuilder &MIRBuilder) { - return MIRBuilder.buildInstr(SPIRV::OpTypeBool) - .addDef(createTypeVReg(MIRBuilder)); + + return createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) { + return MIRBuilder.buildInstr(SPIRV::OpTypeBool) + .addDef(createTypeVReg(MIRBuilder)); + }); } unsigned SPIRVGlobalRegistry::adjustOpTypeIntWidth(unsigned Width) const { @@ -118,24 +122,53 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeInt(unsigned Width, MIRBuilder.buildInstr(SPIRV::OpCapability) .addImm(SPIRV::Capability::ArbitraryPrecisionIntegersINTEL); } - auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypeInt) - .addDef(createTypeVReg(MIRBuilder)) - .addImm(Width) - .addImm(IsSigned ? 1 : 0); - return MIB; + return createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) { + return MIRBuilder.buildInstr(SPIRV::OpTypeInt) + .addDef(createTypeVReg(MIRBuilder)) + .addImm(Width) + .addImm(IsSigned ? 1 : 0); + }); } SPIRVType *SPIRVGlobalRegistry::getOpTypeFloat(uint32_t Width, MachineIRBuilder &MIRBuilder) { - auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypeFloat) - .addDef(createTypeVReg(MIRBuilder)) - .addImm(Width); - return MIB; + return createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) { + return MIRBuilder.buildInstr(SPIRV::OpTypeFloat) + .addDef(createTypeVReg(MIRBuilder)) + .addImm(Width); + }); } SPIRVType *SPIRVGlobalRegistry::getOpTypeVoid(MachineIRBuilder &MIRBuilder) { - return MIRBuilder.buildInstr(SPIRV::OpTypeVoid) - .addDef(createTypeVReg(MIRBuilder)); + return createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) { + return MIRBuilder.buildInstr(SPIRV::OpTypeVoid) + .addDef(createTypeVReg(MIRBuilder)); + }); +} + +SPIRVType *SPIRVGlobalRegistry::createOpType( + MachineIRBuilder &MIRBuilder, + std::function Op) { + auto oldInsertPoint = MIRBuilder.getInsertPt(); + MachineBasicBlock *OldMBB = &MIRBuilder.getMBB(); + + auto LastInsertedType = LastInsertedTypeMap.find(CurMF); + if (LastInsertedType != LastInsertedTypeMap.end()) { + MIRBuilder.setInsertPt(*MIRBuilder.getMF().begin(), + LastInsertedType->second->getIterator()); + } else { + MIRBuilder.setInsertPt(*MIRBuilder.getMF().begin(), + MIRBuilder.getMF().begin()->begin()); + auto Result = LastInsertedTypeMap.try_emplace(CurMF, nullptr); + assert(Result.second); + LastInsertedType = Result.first; + } + + MachineInstr *Type = Op(MIRBuilder); + LastInsertedType->second = Type; + + MIRBuilder.setInsertPt(*OldMBB, oldInsertPoint); + return Type; } SPIRVType *SPIRVGlobalRegistry::getOpTypeVector(uint32_t NumElems, @@ -147,11 +180,12 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeVector(uint32_t NumElems, EleOpc == SPIRV::OpTypeBool) && "Invalid vector element type"); - auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypeVector) - .addDef(createTypeVReg(MIRBuilder)) - .addUse(getSPIRVTypeID(ElemType)) - .addImm(NumElems); - return MIB; + return createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) { + return MIRBuilder.buildInstr(SPIRV::OpTypeVector) + 
.addDef(createTypeVReg(MIRBuilder)) + .addUse(getSPIRVTypeID(ElemType)) + .addImm(NumElems); + }); } std::tuple @@ -688,11 +722,12 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeArray(uint32_t NumElems, SPIRVType *SpvTypeInt32 = getOrCreateSPIRVIntegerType(32, MIRBuilder); Register NumElementsVReg = buildConstantInt(NumElems, MIRBuilder, SpvTypeInt32, EmitIR); - auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypeArray) - .addDef(createTypeVReg(MIRBuilder)) - .addUse(getSPIRVTypeID(ElemType)) - .addUse(NumElementsVReg); - return MIB; + return createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) { + return MIRBuilder.buildInstr(SPIRV::OpTypeArray) + .addDef(createTypeVReg(MIRBuilder)) + .addUse(getSPIRVTypeID(ElemType)) + .addUse(NumElementsVReg); + }); } SPIRVType *SPIRVGlobalRegistry::getOpTypeOpaque(const StructType *Ty, @@ -700,10 +735,12 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeOpaque(const StructType *Ty, assert(Ty->hasName()); const StringRef Name = Ty->hasName() ? Ty->getName() : ""; Register ResVReg = createTypeVReg(MIRBuilder); - auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypeOpaque).addDef(ResVReg); - addStringImm(Name, MIB); - buildOpName(ResVReg, Name, MIRBuilder); - return MIB; + return createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) { + auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypeOpaque).addDef(ResVReg); + addStringImm(Name, MIB); + buildOpName(ResVReg, Name, MIRBuilder); + return MIB; + }); } SPIRVType *SPIRVGlobalRegistry::getOpTypeStruct(const StructType *Ty, @@ -717,14 +754,16 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeStruct(const StructType *Ty, FieldTypes.push_back(getSPIRVTypeID(ElemTy)); } Register ResVReg = createTypeVReg(MIRBuilder); - auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypeStruct).addDef(ResVReg); - for (const auto &Ty : FieldTypes) - MIB.addUse(Ty); - if (Ty->hasName()) - buildOpName(ResVReg, Ty->getName(), MIRBuilder); - if (Ty->isPacked()) - buildOpDecorate(ResVReg, MIRBuilder, SPIRV::Decoration::CPacked, {}); - return MIB; + return createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) { + auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypeStruct).addDef(ResVReg); + for (const auto &Ty : FieldTypes) + MIB.addUse(Ty); + if (Ty->hasName()) + buildOpName(ResVReg, Ty->getName(), MIRBuilder); + if (Ty->isPacked()) + buildOpDecorate(ResVReg, MIRBuilder, SPIRV::Decoration::CPacked, {}); + return MIB; + }); } SPIRVType *SPIRVGlobalRegistry::getOrCreateSpecialType( @@ -739,17 +778,22 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypePointer( MachineIRBuilder &MIRBuilder, Register Reg) { if (!Reg.isValid()) Reg = createTypeVReg(MIRBuilder); - return MIRBuilder.buildInstr(SPIRV::OpTypePointer) - .addDef(Reg) - .addImm(static_cast(SC)) - .addUse(getSPIRVTypeID(ElemType)); + + return createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) { + return MIRBuilder.buildInstr(SPIRV::OpTypePointer) + .addDef(Reg) + .addImm(static_cast(SC)) + .addUse(getSPIRVTypeID(ElemType)); + }); } SPIRVType *SPIRVGlobalRegistry::getOpTypeForwardPointer( SPIRV::StorageClass::StorageClass SC, MachineIRBuilder &MIRBuilder) { - return MIRBuilder.buildInstr(SPIRV::OpTypeForwardPointer) - .addUse(createTypeVReg(MIRBuilder)) - .addImm(static_cast(SC)); + return createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) { + return MIRBuilder.buildInstr(SPIRV::OpTypeForwardPointer) + .addUse(createTypeVReg(MIRBuilder)) + .addImm(static_cast(SC)); + }); } SPIRVType *SPIRVGlobalRegistry::getOpTypeFunction( diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h 
b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h index ed9cfc07132430..cad2bf96adf33e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h @@ -64,6 +64,10 @@ class SPIRVGlobalRegistry { SmallPtrSet TypesInProcessing; DenseMap ForwardPointerTypes; + // Stores for each function the last inserted SPIR-V Type. + // See: SPIRVGlobalRegistry::createOpType. + DenseMap LastInsertedTypeMap; + // if a function returns a pointer, this is to map it into TypedPointerType DenseMap FunResPointerTypes; @@ -97,6 +101,13 @@ class SPIRVGlobalRegistry { SPIRV::AccessQualifier::AccessQualifier AccessQual, bool EmitIR); + // Internal function creating the an OpType at the correct position in the + // function by tweaking the passed "MIRBuilder" insertion point and restoring + // it to the correct position. "Op" should be the function creating the + // specific OpType you need, and should return the newly created instruction. + SPIRVType *createOpType(MachineIRBuilder &MIRBuilder, + std::function Op); + public: SPIRVGlobalRegistry(unsigned PointerSize); diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index f1b10e264781f2..cd0aff1a518439 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -389,9 +389,7 @@ void processInstr(MachineInstr &MI, MachineIRBuilder &MIB, createNewIdReg(nullptr, MI.getOperand(0).getReg(), MRI, *GR).first; AssignTypeInst.getOperand(1).setReg(NewReg); MI.getOperand(0).setReg(NewReg); - MIB.setInsertPt(*MI.getParent(), - (MI.getNextNode() ? MI.getNextNode()->getIterator() - : MI.getParent()->end())); + MIB.setInsertPt(*MI.getParent(), MI.getIterator()); for (auto &Op : MI.operands()) { if (!Op.isReg() || Op.isDef()) continue; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 020021e9d8abcb..efaf79054fb9ff 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -52978,10 +52978,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, // combiner. static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - // First instruction should be a right shift of a multiply. - if (Src.getOpcode() != ISD::SRL || - Src.getOperand(0).getOpcode() != ISD::MUL) - return SDValue(); + using namespace llvm::SDPatternMatch; if (!Subtarget.hasSSE2()) return SDValue(); @@ -52996,15 +52993,12 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, if (InVT.getVectorElementType().getSizeInBits() < 32) return SDValue(); - // Need a shift by 16. - APInt ShiftAmt; - if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) || - ShiftAmt != 16) + // First instruction should be a right shift by 16 of a multiply. + SDValue LHS, RHS; + if (!sd_match(Src, + m_Srl(m_Mul(m_Value(LHS), m_Value(RHS)), m_SpecificInt(16)))) return SDValue(); - SDValue LHS = Src.getOperand(0).getOperand(0); - SDValue RHS = Src.getOperand(0).getOperand(1); - // Count leading sign/zero bits on both inputs - if there are enough then // truncation back to vXi16 will be cheap - either as a pack/shuffle // sequence or using AVX512 truncations. If the inputs are sext/zext then the @@ -53022,12 +53016,13 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, return SDValue(); // Check if both inputs are extensions, which will be removed by truncation. 
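The combinePMULH hunk above replaces hand-written opcode and splat-constant checks with the declarative SDPatternMatch matchers. A small sketch of the same idiom, assuming the matcher names from llvm/CodeGen/SDPatternMatch.h:

    #include "llvm/CodeGen/SDPatternMatch.h"
    #include "llvm/CodeGen/SelectionDAGNodes.h"

    // Returns true if V is (srl (mul A, B), 16) and binds A/B to the multiply
    // operands; the shift-amount matcher also accepts splat vector constants,
    // which is what lets the explicit isConstantSplatVector check go away.
    static bool matchHighHalfOfMul(llvm::SDValue V, llvm::SDValue &A,
                                   llvm::SDValue &B) {
      using namespace llvm::SDPatternMatch;
      return sd_match(V, m_Srl(m_Mul(m_Value(A), m_Value(B)),
                               m_SpecificInt(16)));
    }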
- bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND || - LHS.getOpcode() == ISD::ZERO_EXTEND) && - (RHS.getOpcode() == ISD::SIGN_EXTEND || - RHS.getOpcode() == ISD::ZERO_EXTEND) && - LHS.getOperand(0).getScalarValueSizeInBits() <= 16 && - RHS.getOperand(0).getScalarValueSizeInBits() <= 16; + auto isOpTruncateFree = [](SDValue Op) { + if (Op.getOpcode() == ISD::SIGN_EXTEND || + Op.getOpcode() == ISD::ZERO_EXTEND) + return Op.getOperand(0).getScalarValueSizeInBits() <= 16; + return ISD::isBuildVectorOfConstantSDNodes(Op.getNode()); + }; + bool IsTruncateFree = isOpTruncateFree(LHS) && isOpTruncateFree(RHS); // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on // the (bitcasted) inputs directly, and then cheaply pack/truncate the result diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index ef9c264482a640..0e2b5c925a6a7a 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -194,7 +194,6 @@ void unfold(DomTreeUpdater *DTU, LoopInfo *LI, SelectInstToUnfold SIToUnfold, SelectInst *SI = SIToUnfold.getInst(); PHINode *SIUse = SIToUnfold.getUse(); BasicBlock *StartBlock = SI->getParent(); - BasicBlock *EndBlock = SIUse->getParent(); BranchInst *StartBlockTerm = dyn_cast(StartBlock->getTerminator()); @@ -202,6 +201,7 @@ void unfold(DomTreeUpdater *DTU, LoopInfo *LI, SelectInstToUnfold SIToUnfold, assert(SI->hasOneUse()); if (StartBlockTerm->isUnconditional()) { + BasicBlock *EndBlock = StartBlock->getUniqueSuccessor(); // Arbitrarily choose the 'false' side for a new input value to the PHI. BasicBlock *NewBlock = BasicBlock::Create( SI->getContext(), Twine(SI->getName(), ".si.unfold.false"), @@ -223,32 +223,44 @@ void unfold(DomTreeUpdater *DTU, LoopInfo *LI, SelectInstToUnfold SIToUnfold, NewBlock->getFirstInsertionPt()); NewPhi->addIncoming(SIOp2, StartBlock); - if (auto *OpSi = dyn_cast(SIOp1)) - NewSIsToUnfold->push_back(SelectInstToUnfold(OpSi, SIUse)); - if (auto *OpSi = dyn_cast(SIOp2)) - NewSIsToUnfold->push_back(SelectInstToUnfold(OpSi, NewPhi)); - - // Update the phi node of SI. - for (unsigned Idx = 0; Idx < SIUse->getNumIncomingValues(); ++Idx) { - if (SIUse->getIncomingBlock(Idx) == StartBlock) - SIUse->setIncomingValue(Idx, SIOp1); + // Update any other PHI nodes in EndBlock. + for (PHINode &Phi : EndBlock->phis()) { + if (SIUse == &Phi) + continue; + Phi.addIncoming(Phi.getIncomingValueForBlock(StartBlock), NewBlock); } - SIUse->addIncoming(NewPhi, NewBlock); - // Update any other PHI nodes in EndBlock. - for (auto II = EndBlock->begin(); PHINode *Phi = dyn_cast(II); - ++II) { - if (Phi != SIUse) - Phi->addIncoming(Phi->getIncomingValueForBlock(StartBlock), NewBlock); + // Update the phi node of SI, which is its only use. 
+ if (EndBlock == SIUse->getParent()) { + SIUse->addIncoming(NewPhi, NewBlock); + SIUse->replaceUsesOfWith(SI, SIOp1); + } else { + PHINode *EndPhi = PHINode::Create(SIUse->getType(), pred_size(EndBlock), + Twine(SI->getName(), ".si.unfold.phi"), + EndBlock->getFirstInsertionPt()); + for (BasicBlock *Pred : predecessors(EndBlock)) { + if (Pred != StartBlock && Pred != NewBlock) + EndPhi->addIncoming(EndPhi, Pred); + } + + EndPhi->addIncoming(SIOp1, StartBlock); + EndPhi->addIncoming(NewPhi, NewBlock); + SIUse->replaceUsesOfWith(SI, EndPhi); + SIUse = EndPhi; } - StartBlockTerm->eraseFromParent(); + if (auto *OpSi = dyn_cast(SIOp1)) + NewSIsToUnfold->push_back(SelectInstToUnfold(OpSi, SIUse)); + if (auto *OpSi = dyn_cast(SIOp2)) + NewSIsToUnfold->push_back(SelectInstToUnfold(OpSi, NewPhi)); // Insert the real conditional branch based on the original condition. + StartBlockTerm->eraseFromParent(); BranchInst::Create(EndBlock, NewBlock, SI->getCondition(), StartBlock); DTU->applyUpdates({{DominatorTree::Insert, StartBlock, EndBlock}, {DominatorTree::Insert, StartBlock, NewBlock}}); } else { + BasicBlock *EndBlock = SIUse->getParent(); BasicBlock *NewBlockT = BasicBlock::Create( SI->getContext(), Twine(SI->getName(), ".si.unfold.true"), EndBlock->getParent(), EndBlock); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 0566d80c1cc001..09e4d0fcd31f3c 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -538,12 +538,6 @@ class InnerLoopVectorizer { /// A small list of PHINodes. using PhiVector = SmallVector; - /// A type for scalarized values in the new loop. Each value from the - /// original loop, when scalarized, is represented by UF x VF scalar values - /// in the new unrolled loop, where UF is the unroll factor and VF is the - /// vectorization factor. - using ScalarParts = SmallVector, 2>; - /// Set up the values of the IVs correctly when exiting the vector loop. void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *VectorTripCount, Value *EndValue, @@ -9469,7 +9463,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { return; } - // Generate scalar instances for all VF lanes of all UF parts. + // Generate scalar instances for all VF lanes. 
assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); const unsigned EndLane = State.VF.getKnownMinValue(); for (unsigned Lane = 0; Lane < EndLane; ++Lane) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 7e3dbe6260983e..b79e964cdb1b6b 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -9986,8 +9986,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } Cost += ::getShuffleCost( TTI, TTI::SK_InsertSubvector, - FixedVectorType::get(ScalarTy, CommonMask.size()), {}, CostKind, - Idx, FixedVectorType::get(ScalarTy, E->getVectorFactor())); + getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx, + getWidenedType(ScalarTy, E->getVectorFactor())); if (!CommonMask.empty()) { std::iota(std::next(CommonMask.begin(), Idx), std::next(CommonMask.begin(), Idx + E->getVectorFactor()), diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index ce15b2783cc457..5e4d487261c6f0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -333,10 +333,10 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) { // However, if we are vectorizing, we need to construct the vector values. // If the value is known to be uniform after vectorization, we can just - // broadcast the scalar value corresponding to lane zero for each unroll - // iteration. Otherwise, we construct the vector values using - // insertelement instructions. Since the resulting vectors are stored in - // State, we will only generate the insertelements once. + // broadcast the scalar value corresponding to lane zero. Otherwise, we + // construct the vector values using insertelement instructions. Since the + // resulting vectors are stored in State, we will only generate the + // insertelements once. Value *VectorValue = nullptr; if (IsUniform) { VectorValue = GetBroadcastInstrs(ScalarValue); @@ -769,15 +769,15 @@ void VPRegionBlock::execute(VPTransformState *State) { // Enter replicating mode. State->Instance = VPIteration(0, 0); - assert(!State->VF.isScalable() && "VF is assumed to be non scalable."); - for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF; - ++Lane) { - State->Instance->Lane = VPLane(Lane, VPLane::Kind::First); - // Visit the VPBlocks connected to \p this, starting from it. - for (VPBlockBase *Block : RPOT) { - LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); - Block->execute(State); - } + assert(!State->VF.isScalable() && "VF is assumed to be non scalable."); + for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF; + ++Lane) { + State->Instance->Lane = VPLane(Lane, VPLane::Kind::First); + // Visit the VPBlocks connected to \p this, starting from it. + for (VPBlockBase *Block : RPOT) { + LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); + Block->execute(State); + } } // Exit replicating mode. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 0632495bc511cd..c886a39aec76e5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -254,7 +254,7 @@ struct VPTransformState { DominatorTree *DT, IRBuilderBase &Builder, InnerLoopVectorizer *ILV, VPlan *Plan); - /// The chosen Vectorization and Unroll Factors of the loop being vectorized. + /// The chosen Vectorization Factor of the loop being vectorized. 
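The VPTransformState::get comment above describes the two ways a vector value is materialised from scalars now that the unroll factor is gone from this path: one broadcast when the value is uniform, otherwise one insert per lane. A plain C++ analogue of that distinction, not VPlan code:

    #include <cstddef>
    #include <vector>

    // Uniform values need a single broadcast of lane zero; non-uniform values
    // are assembled element by element, mirroring the insertelement path.
    std::vector<int> materializeVector(const std::vector<int> &ScalarLanes,
                                       bool IsUniform) {
      if (IsUniform)
        return std::vector<int>(ScalarLanes.size(), ScalarLanes[0]);
      std::vector<int> V(ScalarLanes.size());
      for (std::size_t Lane = 0; Lane < ScalarLanes.size(); ++Lane)
        V[Lane] = ScalarLanes[Lane];
      return V;
    }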
ElementCount VF; /// Hold the indices to generate specific scalar instructions. Null indicates @@ -1253,9 +1253,7 @@ class VPInstruction : public VPRecipeWithIRFlags, ComputeReductionResult, // Takes the VPValue to extract from as first operand and the lane or part // to extract as second operand, counting from the end starting with 1 for - // last. The second operand must be a positive constant and <= VF when - // extracting from a vector or <= UF when extracting from an unrolled - // scalar. + // last. The second operand must be a positive constant and <= VF. ExtractFromEnd, LogicalAnd, // Non-poison propagating logical And. // Add an offset in bytes (second operand) to a base pointer (first diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 318d6a8c5b8c34..f33293e65010f9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2490,9 +2490,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { // If the group is reverse, adjust the index to refer to the last vector lane // instead of the first. We adjust the index from the first vector lane, // rather than directly getting the pointer for lane VF - 1, because the - // pointer operand of the interleaved access is supposed to be uniform. For - // uniform instructions, we're only required to generate a value for the - // first vector lane in each unroll iteration. + // pointer operand of the interleaved access is supposed to be uniform. if (Group->isReverse()) { Value *RuntimeVF = getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF); diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-select.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-select.ll index 13994c46335dee..9eadcaca6bb55e 100644 --- a/llvm/test/Analysis/CostModel/RISCV/rvv-select.ll +++ b/llvm/test/Analysis/CostModel/RISCV/rvv-select.ll @@ -390,3 +390,28 @@ define void @select() { ret void } + +define void @select_of_constants() { +; CHECK-LABEL: 'select_of_constants' +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %1 = select i1 undef, <2 x i64> , <2 x i64> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = select i1 undef, <2 x i64> , <2 x i64> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %3 = select i1 undef, <2 x i64> , <2 x i64> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %4 = select i1 undef, <2 x i64> , <2 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = select <4 x i1> undef, <4 x i32> , <4 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + ; Splat constants + select i1 undef, <2 x i64> , <2 x i64> zeroinitializer + ; LHS is a VID patern + select i1 undef, <2 x i64> , <2 x i64> zeroinitializer + select i1 undef, <2 x i64> , <2 x i64> zeroinitializer + ; 2x general (expensive) constants + select i1 undef, <2 x i64> , <2 x i64> + + ; powers of two (still expensive) + select <4 x i1> undef, <4 x i32> , <4 x i32> zeroinitializer + + ret void +} + + diff --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll index 43ac246055da7b..584c0ef7cfeb78 100644 --- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll +++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll @@ -35,6 +35,15 @@ declare i32 @llvm.nvvm.rotate.b32(i32, i32) 
declare i64 @llvm.nvvm.rotate.right.b64(i64, i32) declare i64 @llvm.nvvm.rotate.b64(i64, i32) +declare ptr addrspace(1) @llvm.nvvm.ptr.gen.to.global.p1.p0(ptr) +declare ptr addrspace(3) @llvm.nvvm.ptr.gen.to.shared.p3.p0(ptr) +declare ptr addrspace(4) @llvm.nvvm.ptr.gen.to.constant.p4.p0(ptr) +declare ptr addrspace(5) @llvm.nvvm.ptr.gen.to.local.p5.p0(ptr) +declare ptr @llvm.nvvm.ptr.global.to.gen.p0.p1(ptr addrspace(1)) +declare ptr @llvm.nvvm.ptr.shared.to.gen.p0.p3(ptr addrspace(3)) +declare ptr @llvm.nvvm.ptr.constant.to.gen.p0.p4(ptr addrspace(4)) +declare ptr @llvm.nvvm.ptr.local.to.gen.p0.p5(ptr addrspace(5)) + ; CHECK-LABEL: @simple_upgrade define void @simple_upgrade(i32 %a, i64 %b, i16 %c) { ; CHECK: call i32 @llvm.bitreverse.i32(i32 %a) @@ -156,3 +165,29 @@ define void @rotate(i32 %a, i64 %b) { %r3 = call i64 @llvm.nvvm.rotate.b64(i64 %b, i32 8) ret void } + +; CHECK-LABEL: @addrspacecast +define void @addrspacecast(ptr %p0) { +; CHECK: %1 = addrspacecast ptr %p0 to ptr addrspace(1) +; CHECK: %2 = addrspacecast ptr addrspace(1) %1 to ptr +; CHECK: %3 = addrspacecast ptr %2 to ptr addrspace(3) +; CHECK: %4 = addrspacecast ptr addrspace(3) %3 to ptr +; CHECK: %5 = addrspacecast ptr %4 to ptr addrspace(4) +; CHECK: %6 = addrspacecast ptr addrspace(4) %5 to ptr +; CHECK: %7 = addrspacecast ptr %6 to ptr addrspace(5) +; CHECK: %8 = addrspacecast ptr addrspace(5) %7 to ptr +; + %p1 = call ptr addrspace(1) @llvm.nvvm.ptr.gen.to.global.p1.p0(ptr %p0) + %p2 = call ptr @llvm.nvvm.ptr.global.to.gen.p0.p1(ptr addrspace(1) %p1) + + %p3 = call ptr addrspace(3) @llvm.nvvm.ptr.gen.to.shared.p3.p0(ptr %p2) + %p4 = call ptr @llvm.nvvm.ptr.shared.to.gen.p0.p3(ptr addrspace(3) %p3) + + %p5 = call ptr addrspace(4) @llvm.nvvm.ptr.gen.to.constant.p4.p0(ptr %p4) + %p6 = call ptr @llvm.nvvm.ptr.constant.to.gen.p0.p4(ptr addrspace(4) %p5) + + %p7 = call ptr addrspace(5) @llvm.nvvm.ptr.gen.to.local.p5.p0(ptr %p6) + %p8 = call ptr @llvm.nvvm.ptr.local.to.gen.p0.p5(ptr addrspace(5) %p7) + + ret void +} diff --git a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll new file mode 100644 index 00000000000000..8ef8b5d13b62d4 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll @@ -0,0 +1,255 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s + +; This file tests eliding stack slots when lowering the FSINCOS ISD node. + +define { float, float } @sincos_f32_value_return(float %x) { +; CHECK-LABEL: sincos_f32_value_return: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: add x0, sp, #12 +; CHECK-NEXT: add x1, sp, #8 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldp s1, s0, [sp, #8] +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %sin = tail call float @llvm.sin.f32(float %x) + %cos = tail call float @llvm.cos.f32(float %x) + %ret_0 = insertvalue { float, float } poison, float %sin, 0 + %ret_1 = insertvalue { float, float } %ret_0, float %cos, 1 + ret { float, float } %ret_1 +} + +define void @sincos_f32_ptr_return(float %x, ptr noalias %out_sin, ptr noalias %out_cos) { +; CHECK-LABEL: sincos_f32_ptr_return: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %sin = tail call float @llvm.sin.f32(float %x) + %cos = tail call float @llvm.cos.f32(float %x) + store float %sin, ptr %out_sin, align 4 + store float %cos, ptr %out_cos, align 4 + ret void +} + +define float @sincos_f32_mixed_return(float %x, ptr %out_sin) { +; CHECK-LABEL: sincos_f32_mixed_return: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: add x1, sp, #12 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldr s0, [sp, #12] +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %sin = tail call float @llvm.sin.f32(float %x) + %cos = tail call float @llvm.cos.f32(float %x) + store float %sin, ptr %out_sin, align 4 + ret float %cos +} + +define { double, double } @sincos_f64_value_return(double %x) { +; CHECK-LABEL: sincos_f64_value_return: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: add x0, sp, #24 +; CHECK-NEXT: add x1, sp, #8 +; CHECK-NEXT: bl sincos +; CHECK-NEXT: ldr d0, [sp, #24] +; CHECK-NEXT: ldr d1, [sp, #8] +; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret +entry: + %sin = tail call double @llvm.sin.f64(double %x) + %cos = tail call double @llvm.cos.f64(double %x) + %ret_0 = insertvalue { double, double } poison, double %sin, 0 + %ret_1 = insertvalue { double, double } %ret_0, double %cos, 1 + ret { double, double } %ret_1 +} + +define void @sincos_f64_ptr_return(double %x, ptr noalias %out_sin, ptr noalias %out_cos) { +; CHECK-LABEL: sincos_f64_ptr_return: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl sincos +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %sin = tail call double @llvm.sin.f64(double %x) + %cos = tail call double @llvm.cos.f64(double %x) + store double %sin, ptr %out_sin, align 8 + store double %cos, ptr %out_cos, align 8 + ret void +} + +define double @sincos_f64_mixed_return(double %x, ptr %out_sin) { +; CHECK-LABEL: sincos_f64_mixed_return: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: add x1, sp, #8 +; CHECK-NEXT: bl sincos +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %sin = tail call double @llvm.sin.f64(double %x) + %cos = tail call double @llvm.cos.f64(double %x) + store double %sin, ptr %out_sin, align 8 + ret double %cos +} + +; Here %out_sin and %out_cos may alias so we can't replace both stores with the +; call to sincosf (as the order of stores in sincosf is not defined). 
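The may-alias comment above is the crux of that case: when the two output pointers can alias, the value left in memory depends on the order in which the libcall writes its results, which sincosf does not define, so the generated code above keeps the cosine store separate and ordered after the call. A small C++ illustration, assuming a libm that provides the GNU sincosf extension:

    #include <cmath>   // sincosf is a GNU extension; may require _GNU_SOURCE.

    // Safe shape: compute into private slots, then store in program order.
    // If OutSin == OutCos, the cosine (written last) must be what remains,
    // which is why that store cannot simply be handed to the libcall.
    void sincosToOutputs(float X, float *OutSin, float *OutCos) {
      float S, C;
      sincosf(X, &S, &C);
      *OutSin = S;
      *OutCos = C;
    }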
+define void @sincos_may_alias(float %x, ptr %out_sin, ptr %out_cos) { +; CHECK-LABEL: sincos_may_alias: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: add x1, sp, #12 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldr s0, [sp, #12] +; CHECK-NEXT: str s0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret +entry: + %sin = tail call float @llvm.sin.f32(float %x) + %cos = tail call float @llvm.cos.f32(float %x) + store float %sin, ptr %out_sin, align 4 + store float %cos, ptr %out_cos, align 4 + ret void +} + +; Here %out is used for both sin and cos (with the final value stored being cos). +define float @sincos_multiple_uses(float %x, ptr %out) { +; CHECK-LABEL: sincos_multiple_uses: +; CHECK: // %bb.0: +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: mov x1, x0 +; CHECK-NEXT: add x0, sp, #12 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldr s0, [sp, #12] +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %sin = call float @llvm.sin.f32(float %x) + store float %sin, ptr %out, align 4 + %reload = load float, ptr %out, align 4 + %cos = call float @llvm.cos.f32(float %x) + store float %cos, ptr %out, align 4 + ret float %reload +} + +; Negative test. We can't fold volatile stores into the library call. +define void @sincos_volatile_result_stores(float %x, ptr noalias %out_sin, ptr noalias %out_cos) { +; CHECK-LABEL: sincos_volatile_result_stores: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -32 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: mov x20, x0 +; CHECK-NEXT: add x0, sp, #12 +; CHECK-NEXT: add x1, sp, #8 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldp s1, s0, [sp, #8] +; CHECK-NEXT: str s0, [x20] +; CHECK-NEXT: str s1, [x19] +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %sin = tail call float @llvm.sin.f32(float %x) + %cos = tail call float @llvm.cos.f32(float %x) + store volatile float %sin, ptr %out_sin, align 4 + store volatile float %cos, ptr %out_cos, align 4 + ret void +} + +; Negative test. We can't fold atomic stores into the library call. +define void @sincos_atomic_result_stores(float %x, ptr noalias %out_sin, ptr noalias %out_cos) { +; CHECK-LABEL: sincos_atomic_result_stores: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-32]! 
// 8-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -32 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: mov x20, x0 +; CHECK-NEXT: add x0, sp, #12 +; CHECK-NEXT: add x1, sp, #8 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldr w8, [sp, #12] +; CHECK-NEXT: str w8, [x20] +; CHECK-NEXT: ldr w8, [sp, #8] +; CHECK-NEXT: str w8, [x19] +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %sin = tail call float @llvm.sin.f32(float %x) + %cos = tail call float @llvm.cos.f32(float %x) + store atomic float %sin, ptr %out_sin unordered, align 4 + store atomic float %cos, ptr %out_cos unordered, align 4 + ret void +} + +; Negative test. We can't fold misaligned stores into the library call. +define void @sincos_misaligned_result_stores(double %x, ptr noalias %out_sin, ptr noalias %out_cos) { +; CHECK-LABEL: sincos_misaligned_result_stores: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -32 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: mov x20, x0 +; CHECK-NEXT: add x0, sp, #24 +; CHECK-NEXT: add x1, sp, #8 +; CHECK-NEXT: bl sincos +; CHECK-NEXT: ldr d0, [sp, #24] +; CHECK-NEXT: ldr d1, [sp, #8] +; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: str d0, [x20] +; CHECK-NEXT: str d1, [x19] +; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #48 +; CHECK-NEXT: ret +entry: + %sin = tail call double @llvm.sin.f64(double %x) + %cos = tail call double @llvm.cos.f64(double %x) + store double %sin, ptr %out_sin, align 4 + store double %cos, ptr %out_cos, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll index ac19bd59babe46..803bb9fda458b9 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll @@ -45,6 +45,51 @@ define void @test_no_stackslot_scavenging(float %f) #0 { ret void } +define void @test_no_stackslot_scavenging_with_fp(float %f, i64 %n) #0 "frame-pointer"="all" { +; CHECK-LABEL: test_no_stackslot_scavenging_with_fp: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-128]! 
// 16-byte Folded Spill +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: add x29, sp, #64 +; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: stp x28, x25, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: stp x24, x19, [sp, #112] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: lsl x9, x0, #3 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: str s0, [x29, #28] // 4-byte Folded Spill +; CHECK-NEXT: add x9, x9, #15 +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr s0, [x29, #28] // 4-byte Folded Reload +; CHECK-NEXT: bl use_f +; CHECK-NEXT: smstart sm +; CHECK-NEXT: sub sp, x29, #64 +; CHECK-NEXT: ldp x24, x19, [sp, #112] // 16-byte Folded Reload +; CHECK-NEXT: ldp x28, x25, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #128 // 16-byte Folded Reload +; CHECK-NEXT: ret + %ptr2 = alloca i64, i64 %n, align 8 + %ptr = alloca + call void asm sideeffect "", "~{x24},~{x25}"() nounwind + call void @use_f(float %f) + ret void +} + declare void @use_f(float) +declare void @use_f_and_ptr(float, ptr) attributes #0 = { nounwind "target-features"="+sve,+sme" "aarch64_pstate_sm_enabled" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll index 6e029f7c0a95e5..4fb28b392c9ea9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll @@ -1,43 +1,43 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GLOBAL-ISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 { -; GCN-LABEL: test1_s_barrier_signal: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v3, v2, s[0:1] -; GCN-NEXT: s_wait_storecnt 0x0 -; GCN-NEXT: s_barrier_signal -1 -; GCN-NEXT: s_barrier_wait -1 -; GCN-NEXT: global_store_b32 v3, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test1_s_barrier_signal: +; GFX12-SDAG: ; %bb.0: ; 
%entry +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_barrier_signal -1 +; GFX12-SDAG-NEXT: s_barrier_wait -1 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test1_s_barrier_signal: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 -; GLOBAL-ISEL-NEXT: s_barrier_signal -1 -; GLOBAL-ISEL-NEXT: s_barrier_wait -1 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test1_s_barrier_signal: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_barrier_signal -1 +; GFX12-GISEL-NEXT: s_barrier_wait -1 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -51,41 +51,41 @@ entry: } define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 { -; GCN-LABEL: test2_s_barrier_signal: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v3, v2, s[0:1] -; GCN-NEXT: s_wait_storecnt 0x0 -; GCN-NEXT: s_barrier_signal 1 -; GCN-NEXT: s_barrier_wait 1 -; GCN-NEXT: global_store_b32 v3, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test2_s_barrier_signal: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_barrier_signal 1 +; GFX12-SDAG-NEXT: s_barrier_wait 1 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test2_s_barrier_signal: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 -; GLOBAL-ISEL-NEXT: s_barrier_signal 1 -; GLOBAL-ISEL-NEXT: s_barrier_wait 1 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test2_s_barrier_signal: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_barrier_signal 1 +; GFX12-GISEL-NEXT: s_barrier_wait 1 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -99,41 +99,41 @@ entry: } define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 { -; GCN-LABEL: test3_s_barrier_signal: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v3, v2, s[0:1] -; GCN-NEXT: s_wait_storecnt 0x0 -; GCN-NEXT: s_barrier_signal 0 -; GCN-NEXT: s_barrier_wait 0 -; GCN-NEXT: global_store_b32 v3, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test3_s_barrier_signal: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GFX12-SDAG-NEXT: 
v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_barrier_signal 0 +; GFX12-SDAG-NEXT: s_barrier_wait 0 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test3_s_barrier_signal: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 -; GLOBAL-ISEL-NEXT: s_barrier_signal 0 -; GLOBAL-ISEL-NEXT: s_barrier_wait 0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test3_s_barrier_signal: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_barrier_signal 0 +; GFX12-GISEL-NEXT: s_barrier_wait 0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -147,44 +147,44 @@ entry: } define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0 { -; GCN-LABEL: test1_s_barrier_signal_var: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_mov_b32 m0, 1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v3, v1, s[0:1] -; GCN-NEXT: s_wait_storecnt 0x0 -; GCN-NEXT: s_barrier_signal m0 -; GCN-NEXT: s_barrier_wait 1 -; GCN-NEXT: global_store_b32 v3, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test1_s_barrier_signal_var: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, 1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v2, v0, v0 +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 
v3, v1, s[0:1] +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_barrier_signal m0 +; GFX12-SDAG-NEXT: s_barrier_wait 1 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_var: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v2, 0 -; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 -; GLOBAL-ISEL-NEXT: s_barrier_signal m0 -; GLOBAL-ISEL-NEXT: s_barrier_wait 1 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test1_s_barrier_signal_var: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_mov_b32 m0, 1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_barrier_signal m0 +; GFX12-GISEL-NEXT: s_barrier_wait 1 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -198,83 +198,83 @@ entry: } define void @test2_s_barrier_signal_var(i32 %arg) { -; GCN-LABEL: test2_s_barrier_signal_var: -; GCN: ; %bb.0: -; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-NEXT: s_wait_expcnt 0x0 -; GCN-NEXT: s_wait_samplecnt 0x0 -; GCN-NEXT: s_wait_bvhcnt 0x0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: s_wait_storecnt 0x0 -; GCN-NEXT: s_barrier_signal m0 -; GCN-NEXT: s_wait_alu 0xfffe -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX12-SDAG-LABEL: test2_s_barrier_signal_var: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_mov_b32 m0, s0 +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_barrier_signal m0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GLOBAL-ISEL-LABEL: test2_s_barrier_signal_var: -; GLOBAL-ISEL: ; %bb.0: -; GLOBAL-ISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_expcnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_samplecnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_bvhcnt 0x0 -; 
GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0 -; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 -; GLOBAL-ISEL-NEXT: s_barrier_signal m0 -; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31] +; GFX12-GISEL-LABEL: test2_s_barrier_signal_var: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 m0, v0 +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_barrier_signal m0 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] call void @llvm.amdgcn.s.barrier.signal.var(i32 %arg) ret void } define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { -; GCN-LABEL: test1_s_barrier_signal_isfirst: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v0, v1, s[6:7] -; GCN-NEXT: s_wait_storecnt 0x0 -; GCN-NEXT: s_barrier_signal_isfirst -1 -; GCN-NEXT: s_cselect_b32 s3, s3, s5 -; GCN-NEXT: s_cselect_b32 s2, s2, s4 -; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: global_load_b32 v2, v1, s[0:1] -; GCN-NEXT: global_load_b32 v1, v1, s[2:3] -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: v_mul_lo_u32 v1, v1, v2 -; GCN-NEXT: global_store_b32 v0, v1, s[6:7] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test1_s_barrier_signal_isfirst: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1 +; GFX12-SDAG-NEXT: s_cselect_b32 s3, s3, s5 +; GFX12-SDAG-NEXT: s_cselect_b32 s2, s2, s4 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: global_load_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: global_load_b32 v1, v1, s[2:3] +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_isfirst: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 -; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst -1 -; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1 -; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0 -; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GLOBAL-ISEL-NEXT: s_clause 0x1 -; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1] -; GLOBAL-ISEL-NEXT: global_load_b32 
v1, v1, s[2:3] -; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test1_s_barrier_signal_isfirst: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1 +; GFX12-GISEL-NEXT: s_cselect_b32 s8, 1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_and_b32 s8, s8, 1 +; GFX12-GISEL-NEXT: s_cmp_lg_u32 s8, 0 +; GFX12-GISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: global_load_b32 v2, v1, s[0:1] +; GFX12-GISEL-NEXT: global_load_b32 v1, v1, s[2:3] +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -289,52 +289,52 @@ entry: } define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { -; GCN-LABEL: test2_s_barrier_signal_isfirst: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v0, v1, s[6:7] -; GCN-NEXT: s_wait_storecnt 0x0 -; GCN-NEXT: s_barrier_signal_isfirst 1 -; GCN-NEXT: s_cselect_b32 s3, s3, s5 -; GCN-NEXT: s_cselect_b32 s2, s2, s4 -; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: global_load_b32 v2, v1, s[0:1] -; GCN-NEXT: global_load_b32 v1, v1, s[2:3] -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: v_mul_lo_u32 v1, v1, v2 -; GCN-NEXT: global_store_b32 v0, v1, s[6:7] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test2_s_barrier_signal_isfirst: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_barrier_signal_isfirst 1 +; GFX12-SDAG-NEXT: s_cselect_b32 s3, s3, s5 +; GFX12-SDAG-NEXT: s_cselect_b32 s2, s2, s4 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: global_load_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: global_load_b32 v1, v1, s[2:3] +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: 
test2_s_barrier_signal_isfirst: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 -; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst 1 -; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1 -; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0 -; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GLOBAL-ISEL-NEXT: s_clause 0x1 -; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1] -; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3] -; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test2_s_barrier_signal_isfirst: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_barrier_signal_isfirst 1 +; GFX12-GISEL-NEXT: s_cselect_b32 s8, 1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_and_b32 s8, s8, 1 +; GFX12-GISEL-NEXT: s_cmp_lg_u32 s8, 0 +; GFX12-GISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: global_load_b32 v2, v1, s[0:1] +; GFX12-GISEL-NEXT: global_load_b32 v1, v1, s[2:3] +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -349,52 +349,52 @@ entry: } define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { -; GCN-LABEL: test3_s_barrier_signal_isfirst: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v0, v1, s[6:7] -; GCN-NEXT: s_wait_storecnt 0x0 -; GCN-NEXT: s_barrier_signal_isfirst 1 -; GCN-NEXT: s_cselect_b32 s3, s3, s5 -; GCN-NEXT: s_cselect_b32 s2, s2, s4 -; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: global_load_b32 v2, v1, s[0:1] -; GCN-NEXT: global_load_b32 v1, v1, s[2:3] -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: v_mul_lo_u32 v1, v1, v2 -; GCN-NEXT: global_store_b32 v0, v1, s[6:7] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test3_s_barrier_signal_isfirst: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b256 s[0:7], 
s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_barrier_signal_isfirst 1 +; GFX12-SDAG-NEXT: s_cselect_b32 s3, s3, s5 +; GFX12-SDAG-NEXT: s_cselect_b32 s2, s2, s4 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: global_load_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: global_load_b32 v1, v1, s[2:3] +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test3_s_barrier_signal_isfirst: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 -; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst 1 -; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1 -; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0 -; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GLOBAL-ISEL-NEXT: s_clause 0x1 -; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1] -; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3] -; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test3_s_barrier_signal_isfirst: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_barrier_signal_isfirst 1 +; GFX12-GISEL-NEXT: s_cselect_b32 s8, 1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_and_b32 s8, s8, 1 +; GFX12-GISEL-NEXT: s_cmp_lg_u32 s8, 0 +; GFX12-GISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: global_load_b32 v2, v1, s[0:1] +; GFX12-GISEL-NEXT: global_load_b32 v1, v1, s[2:3] +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -409,54 +409,54 @@ entry: } define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { -; GCN-LABEL: test1_s_barrier_signal_isfirst_var: -; GCN: ; %bb.0: ; %entry -; 
GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_mov_b32 m0, 1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v0, v1, s[6:7] -; GCN-NEXT: s_wait_storecnt 0x0 -; GCN-NEXT: s_barrier_signal_isfirst m0 -; GCN-NEXT: s_cselect_b32 s3, s3, s5 -; GCN-NEXT: s_cselect_b32 s2, s2, s4 -; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: global_load_b32 v2, v1, s[0:1] -; GCN-NEXT: global_load_b32 v1, v1, s[2:3] -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: v_mul_lo_u32 v1, v1, v2 -; GCN-NEXT: global_store_b32 v0, v1, s[6:7] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test1_s_barrier_signal_isfirst_var: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, 1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_barrier_signal_isfirst m0 +; GFX12-SDAG-NEXT: s_cselect_b32 s3, s3, s5 +; GFX12-SDAG-NEXT: s_cselect_b32 s2, s2, s4 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: global_load_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: global_load_b32 v1, v1, s[2:3] +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_isfirst_var: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 -; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst m0 -; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1 -; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0 -; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GLOBAL-ISEL-NEXT: s_clause 0x1 -; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1] -; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3] -; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test1_s_barrier_signal_isfirst_var: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_mov_b32 m0, 1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_barrier_signal_isfirst m0 +; GFX12-GISEL-NEXT: 
s_cselect_b32 s8, 1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_and_b32 s8, s8, 1 +; GFX12-GISEL-NEXT: s_cmp_lg_u32 s8, 0 +; GFX12-GISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: global_load_b32 v2, v1, s[0:1] +; GFX12-GISEL-NEXT: global_load_b32 v1, v1, s[2:3] +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -471,65 +471,65 @@ entry: } define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, i32 %arg, ptr addrspace(1) %out) { -; GCN-LABEL: test2_s_barrier_signal_isfirst_var: -; GCN: ; %bb.0: -; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-NEXT: s_wait_expcnt 0x0 -; GCN-NEXT: s_wait_samplecnt 0x0 -; GCN-NEXT: s_wait_bvhcnt 0x0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v31 -; GCN-NEXT: v_readfirstlane_b32 s0, v6 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 2, v9 -; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_add_co_u32 v7, vcc_lo, v7, v9 -; GCN-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo -; GCN-NEXT: global_store_b32 v[7:8], v10, off -; GCN-NEXT: s_wait_storecnt 0x0 -; GCN-NEXT: s_barrier_signal_isfirst m0 -; GCN-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GCN-NEXT: s_wait_alu 0xfffe -; GCN-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3 -; GCN-NEXT: global_load_b32 v0, v[0:1], off -; GCN-NEXT: global_load_b32 v1, v[2:3], off -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: v_mul_lo_u32 v0, v1, v0 -; GCN-NEXT: global_store_b32 v[7:8], v0, off -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX12-SDAG-LABEL: test2_s_barrier_signal_isfirst_var: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v31 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v9, 2, v9 +; GFX12-SDAG-NEXT: s_mov_b32 m0, s0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_add_co_u32 v7, vcc_lo, v7, v9 +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo +; GFX12-SDAG-NEXT: global_store_b32 v[7:8], v10, off +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_barrier_signal_isfirst m0 +; GFX12-SDAG-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off +; GFX12-SDAG-NEXT: global_load_b32 v1, v[2:3], off +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_mul_lo_u32 v0, v1, v0 +; GFX12-SDAG-NEXT: global_store_b32 v[7:8], v0, off +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GLOBAL-ISEL-LABEL: 
test2_s_barrier_signal_isfirst_var: -; GLOBAL-ISEL: ; %bb.0: -; GLOBAL-ISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_expcnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_samplecnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_bvhcnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v9, 0x3ff, v31 -; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v6 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v9, 2, v9 -; GLOBAL-ISEL-NEXT: v_add_co_u32 v7, vcc_lo, v7, v9 -; GLOBAL-ISEL-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo -; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v9, 0 -; GLOBAL-ISEL-NEXT: global_store_b32 v[7:8], v9, off -; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 -; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst m0 -; GLOBAL-ISEL-NEXT: s_cselect_b32 s0, 1, 0 -; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe -; GLOBAL-ISEL-NEXT: s_and_b32 s0, 1, s0 -; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe -; GLOBAL-ISEL-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GLOBAL-ISEL-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3 -; GLOBAL-ISEL-NEXT: global_load_b32 v0, v[0:1], off -; GLOBAL-ISEL-NEXT: global_load_b32 v1, v[2:3], off -; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: global_store_b32 v[7:8], v0, off -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31] +; GFX12-GISEL-LABEL: test2_s_barrier_signal_isfirst_var: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_and_b32_e32 v9, 0x3ff, v31 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 m0, v6 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v9, 2, v9 +; GFX12-GISEL-NEXT: v_add_co_u32 v7, vcc_lo, v7, v9 +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo +; GFX12-GISEL-NEXT: v_mov_b32_e32 v9, 0 +; GFX12-GISEL-NEXT: global_store_b32 v[7:8], v9, off +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_barrier_signal_isfirst m0 +; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_and_b32 s0, 1, s0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off +; GFX12-GISEL-NEXT: global_load_b32 v1, v[2:3], off +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v0, v1, v0 +; GFX12-GISEL-NEXT: global_store_b32 v[7:8], v0, off +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp store i32 0, ptr addrspace(1) %tmp1 @@ -543,40 +543,40 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa } define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { -; GCN-LABEL: test1_s_barrier_init: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: v_mul_u32_u24_e32 
v1, v0, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_lshl_b32 s2, s2, 16 -; GCN-NEXT: global_store_b32 v3, v2, s[0:1] -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: s_barrier_init -1 -; GCN-NEXT: global_store_b32 v3, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test1_s_barrier_init: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_lshl_b32 s2, s2, 16 +; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-SDAG-NEXT: s_mov_b32 m0, s2 +; GFX12-SDAG-NEXT: s_barrier_init -1 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test1_s_barrier_init: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_barrier_init -1 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test1_s_barrier_init: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_lshl_b32 m0, 16, s2 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_barrier_init -1 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -589,40 +589,40 @@ entry: } define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { -; GCN-LABEL: test2_s_barrier_init: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_lshl_b32 s2, s2, 16 -; GCN-NEXT: global_store_b32 v3, v2, s[0:1] -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: s_barrier_init 1 -; GCN-NEXT: 
global_store_b32 v3, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test2_s_barrier_init: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_lshl_b32 s2, s2, 16 +; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-SDAG-NEXT: s_mov_b32 m0, s2 +; GFX12-SDAG-NEXT: s_barrier_init 1 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test2_s_barrier_init: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_barrier_init 1 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test2_s_barrier_init: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_lshl_b32 m0, 16, s2 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_barrier_init 1 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -635,40 +635,40 @@ entry: } define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { -; GCN-LABEL: test3_s_barrier_init: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_lshl_b32 s2, s2, 16 -; GCN-NEXT: global_store_b32 v3, v2, s[0:1] -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: s_barrier_init 0 -; GCN-NEXT: global_store_b32 v3, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test3_s_barrier_init: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 
+; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_lshl_b32 s2, s2, 16 +; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-SDAG-NEXT: s_mov_b32 m0, s2 +; GFX12-SDAG-NEXT: s_barrier_init 0 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test3_s_barrier_init: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_barrier_init 0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test3_s_barrier_init: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_lshl_b32 m0, 16, s2 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_barrier_init 0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -681,43 +681,43 @@ entry: } define amdgpu_kernel void @test4_s_barrier_init(ptr addrspace(1) %out, i32 %bar, i32 %mbrCnt) #0 { -; GCN-LABEL: test4_s_barrier_init: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_lshl_b32 s3, s3, 16 -; GCN-NEXT: global_store_b32 v3, v2, s[0:1] -; GCN-NEXT: s_or_b32 s2, s2, s3 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: s_barrier_init m0 -; GCN-NEXT: global_store_b32 v3, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test4_s_barrier_init: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; 
GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_lshl_b32 s3, s3, 16 +; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-SDAG-NEXT: s_or_b32 s2, s2, s3 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_mov_b32 m0, s2 +; GFX12-SDAG-NEXT: s_barrier_init m0 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test4_s_barrier_init: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_lshl_b32 s3, 16, s3 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_or_b32 m0, s2, s3 -; GLOBAL-ISEL-NEXT: s_barrier_init m0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test4_s_barrier_init: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_lshl_b32 s3, 16, s3 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_or_b32 m0, s2, s3 +; GFX12-GISEL-NEXT: s_barrier_init m0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -730,74 +730,76 @@ entry: } define void @test5_s_barrier_init_m0(i32 %arg1 ,i32 %arg2) { -; GCN-LABEL: test5_s_barrier_init_m0: -; GCN: ; %bb.0: -; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-NEXT: s_wait_expcnt 0x0 -; GCN-NEXT: s_wait_samplecnt 0x0 -; GCN-NEXT: s_wait_bvhcnt 0x0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: s_barrier_init m0 -; GCN-NEXT: s_wait_alu 0xfffe -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX12-SDAG-LABEL: test5_s_barrier_init_m0: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; 
GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_mov_b32 m0, s0 +; GFX12-SDAG-NEXT: s_barrier_init m0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GLOBAL-ISEL-LABEL: test5_s_barrier_init_m0: -; GLOBAL-ISEL: ; %bb.0: -; GLOBAL-ISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_expcnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_samplecnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_bvhcnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 s0, v1 -; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 s1, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: s_lshl_b32 s0, 16, s0 -; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe -; GLOBAL-ISEL-NEXT: s_or_b32 m0, s1, s0 -; GLOBAL-ISEL-NEXT: s_barrier_init m0 -; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe -; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31] +; GFX12-GISEL-LABEL: test5_s_barrier_init_m0: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v1 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_lshl_b32 s0, 16, s0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_or_b32 m0, s1, s0 +; GFX12-GISEL-NEXT: s_barrier_init m0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] call void @llvm.amdgcn.s.barrier.init(i32 %arg1, i32 %arg2) ret void } define amdgpu_kernel void @test1_s_barrier_join(ptr addrspace(1) %out) #0 { -; GCN-LABEL: test1_s_barrier_join: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_barrier_join -1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v2, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test1_s_barrier_join: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_barrier_join -1 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test1_s_barrier_join: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, 
v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-SDAG-NEXT: s_barrier_join -1 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: test1_s_barrier_join: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_barrier_join -1 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -810,36 +812,38 @@ entry: } define amdgpu_kernel void @test2_s_barrier_join(ptr addrspace(1) %out) #0 { -; GCN-LABEL: test2_s_barrier_join: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_barrier_join 1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v2, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test2_s_barrier_join: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_barrier_join 1 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test2_s_barrier_join: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-SDAG-NEXT: s_barrier_join 1 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: test2_s_barrier_join: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_barrier_join 1 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -852,36 +856,38 @@ entry: } define amdgpu_kernel void @test3_s_barrier_join(ptr addrspace(1) %out) #0 { -; GCN-LABEL: test3_s_barrier_join: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_barrier_join 0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v2, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test3_s_barrier_join: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_barrier_join 0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test3_s_barrier_join: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-SDAG-NEXT: s_barrier_join 0 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: test3_s_barrier_join: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_barrier_join 0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr 
i32, ptr addrspace(1) %out, i32 %tmp @@ -894,39 +900,39 @@ entry: } define amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %bar) #0 { -; GCN-LABEL: test4_s_barrier_join_m0: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: global_store_b32 v3, v1, s[0:1] -; GCN-NEXT: s_barrier_join m0 -; GCN-NEXT: global_store_b32 v3, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test4_s_barrier_join_m0: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v2, v0, v0 +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, s2 +; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1] +; GFX12-SDAG-NEXT: s_barrier_join m0 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test4_s_barrier_join_m0: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_barrier_join m0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test4_s_barrier_join_m0: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_barrier_join m0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -939,79 +945,93 @@ entry: } define void @test5_s_barrier_join_m0(i32 %arg) { -; GCN-LABEL: test5_s_barrier_join_m0: -; GCN: ; %bb.0: -; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-NEXT: s_wait_expcnt 0x0 -; GCN-NEXT: s_wait_samplecnt 0x0 -; GCN-NEXT: s_wait_bvhcnt 0x0 -; GCN-NEXT: 
s_wait_kmcnt 0x0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: s_barrier_join m0 -; GCN-NEXT: s_wait_alu 0xfffe -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX12-SDAG-LABEL: test5_s_barrier_join_m0: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_mov_b32 m0, s0 +; GFX12-SDAG-NEXT: s_barrier_join m0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GLOBAL-ISEL-LABEL: test5_s_barrier_join_m0: -; GLOBAL-ISEL: ; %bb.0: -; GLOBAL-ISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_expcnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_samplecnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_bvhcnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0 -; GLOBAL-ISEL-NEXT: s_barrier_join m0 -; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31] +; GFX12-GISEL-LABEL: test5_s_barrier_join_m0: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 m0, v0 +; GFX12-GISEL-NEXT: s_barrier_join m0 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] call void @llvm.amdgcn.s.barrier.join(i32 %arg) ret void } +define void @test6_s_barrier_join_0() { +; GFX12-LABEL: test6_s_barrier_join_0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_barrier_join 0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + call void @llvm.amdgcn.s.barrier.join(i32 0) + ret void +} + define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { -; GCN-LABEL: test1_s_barrier_leave: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v0, v1, s[6:7] -; GCN-NEXT: s_barrier_leave -; GCN-NEXT: s_cselect_b32 s3, s3, s5 -; GCN-NEXT: s_cselect_b32 s2, s2, s4 -; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: global_load_b32 v2, v1, s[0:1] -; GCN-NEXT: global_load_b32 v1, v1, s[2:3] -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: v_mul_lo_u32 v1, v1, v2 -; GCN-NEXT: global_store_b32 v0, v1, s[6:7] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test1_s_barrier_leave: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-SDAG-NEXT: s_barrier_leave +; GFX12-SDAG-NEXT: s_cselect_b32 s3, s3, s5 +; GFX12-SDAG-NEXT: s_cselect_b32 s2, s2, s4 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: global_load_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: 
global_load_b32 v1, v1, s[2:3] +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test1_s_barrier_leave: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GLOBAL-ISEL-NEXT: s_barrier_leave -; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0 -; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0 -; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GLOBAL-ISEL-NEXT: s_clause 0x1 -; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1] -; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3] -; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test1_s_barrier_leave: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-GISEL-NEXT: s_barrier_leave +; GFX12-GISEL-NEXT: s_cselect_b32 s8, 1, 0 +; GFX12-GISEL-NEXT: s_and_b32 s8, s8, 1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_cmp_lg_u32 s8, 0 +; GFX12-GISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: global_load_b32 v2, v1, s[0:1] +; GFX12-GISEL-NEXT: global_load_b32 v1, v1, s[2:3] +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -1026,36 +1046,38 @@ entry: } define amdgpu_kernel void @test1_s_wakeup_barrier(ptr addrspace(1) %out) #0 { -; GCN-LABEL: test1_s_wakeup_barrier: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_wakeup_barrier -1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v2, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test1_s_wakeup_barrier: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_wakeup_barrier -1 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test1_s_wakeup_barrier: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-SDAG-NEXT: s_wakeup_barrier -1 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: test1_s_wakeup_barrier: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_wakeup_barrier -1 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -1068,36 +1090,38 @@ entry: } define amdgpu_kernel void @test2_s_wakeup_barrier(ptr addrspace(1) %out) #0 { -; GCN-LABEL: test2_s_wakeup_barrier: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_wakeup_barrier 1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v2, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test2_s_wakeup_barrier: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_wakeup_barrier 1 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test2_s_wakeup_barrier: +; 
GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-SDAG-NEXT: s_wakeup_barrier 1 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: test2_s_wakeup_barrier: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_wakeup_barrier 1 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -1110,36 +1134,38 @@ entry: } define amdgpu_kernel void @test3_s_wakeup_barrier(ptr addrspace(1) %out) #0 { -; GCN-LABEL: test3_s_wakeup_barrier: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_wakeup_barrier 0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v2, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test3_s_wakeup_barrier: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_wakeup_barrier 0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test3_s_wakeup_barrier: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-SDAG-NEXT: s_wakeup_barrier 0 +; GFX12-SDAG-NEXT: 
global_store_b32 v3, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: test3_s_wakeup_barrier: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_wakeup_barrier 0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -1152,39 +1178,39 @@ entry: } define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32 %bar) #0 { -; GCN-LABEL: test4_s_wakeup_barrier_m0: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: global_store_b32 v3, v1, s[0:1] -; GCN-NEXT: s_wakeup_barrier m0 -; GCN-NEXT: global_store_b32 v3, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test4_s_wakeup_barrier_m0: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v2, v0, v0 +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, s2 +; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1] +; GFX12-SDAG-NEXT: s_wakeup_barrier m0 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test4_s_wakeup_barrier_m0: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_wakeup_barrier m0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test4_s_wakeup_barrier_m0: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; 
GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_wakeup_barrier m0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -1197,63 +1223,50 @@ entry: } define void @test5_s_wakeup_barrier_m0(i32 %arg) { -; GCN-LABEL: test5_s_wakeup_barrier_m0: -; GCN: ; %bb.0: -; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-NEXT: s_wait_expcnt 0x0 -; GCN-NEXT: s_wait_samplecnt 0x0 -; GCN-NEXT: s_wait_bvhcnt 0x0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: s_wakeup_barrier m0 -; GCN-NEXT: s_wait_alu 0xfffe -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX12-SDAG-LABEL: test5_s_wakeup_barrier_m0: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_mov_b32 m0, s0 +; GFX12-SDAG-NEXT: s_wakeup_barrier m0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GLOBAL-ISEL-LABEL: test5_s_wakeup_barrier_m0: -; GLOBAL-ISEL: ; %bb.0: -; GLOBAL-ISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_expcnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_samplecnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_bvhcnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0 -; GLOBAL-ISEL-NEXT: s_wakeup_barrier m0 -; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31] +; GFX12-GISEL-LABEL: test5_s_wakeup_barrier_m0: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 m0, v0 +; GFX12-GISEL-NEXT: s_wakeup_barrier m0 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] call void @llvm.amdgcn.s.wakeup.barrier(i32 %arg) ret void } define amdgpu_kernel void @test1_s_get_barrier_state(ptr addrspace(1) %out) #0 { -; GCN-LABEL: test1_s_get_barrier_state: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_get_barrier_state s4, -1 -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: global_store_b32 v0, v1, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm -; -; GLOBAL-ISEL-LABEL: test1_s_get_barrier_state: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) -; 
GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, -1 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-LABEL: test1_s_get_barrier_state: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_get_barrier_state s2, -1 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -1264,34 +1277,21 @@ entry: } define amdgpu_kernel void @test2_s_get_barrier_state(ptr addrspace(1) %out) #0 { -; GCN-LABEL: test2_s_get_barrier_state: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_get_barrier_state s4, 1 -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: global_store_b32 v0, v1, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm -; -; GLOBAL-ISEL-LABEL: test2_s_get_barrier_state: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, 1 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-LABEL: test2_s_get_barrier_state: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_get_barrier_state s2, 1 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -1302,34 +1302,21 @@ entry: } define amdgpu_kernel void @test3_s_get_barrier_state(ptr addrspace(1) %out) #0 { -; GCN-LABEL: test3_s_get_barrier_state: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_get_barrier_state s4, 
0 -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: global_store_b32 v0, v1, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm -; -; GLOBAL-ISEL-LABEL: test3_s_get_barrier_state: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, 0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-LABEL: test3_s_get_barrier_state: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_get_barrier_state s2, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -1340,41 +1327,23 @@ entry: } define amdgpu_kernel void @test4_s_get_barrier_state_m0(ptr addrspace(1) %out, i32 %bar) #0 { -; GCN-LABEL: test4_s_get_barrier_state_m0: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: global_store_b32 v0, v1, s[0:1] -; GCN-NEXT: s_get_barrier_state s2, m0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_2) -; GCN-NEXT: v_mov_b32_e32 v1, s2 -; GCN-NEXT: global_store_b32 v0, v1, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm -; -; GLOBAL-ISEL-LABEL: test4_s_get_barrier_state_m0: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, m0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) -; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-LABEL: test4_s_get_barrier_state_m0: 
+; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 m0, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_get_barrier_state s2, m0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) +; GFX12-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -1385,76 +1354,94 @@ entry: } define i32 @test5_s_get_barrier_state_m0(i32 %arg) { -; GCN-LABEL: test5_s_get_barrier_state_m0: -; GCN: ; %bb.0: -; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-NEXT: s_wait_expcnt 0x0 -; GCN-NEXT: s_wait_samplecnt 0x0 -; GCN-NEXT: s_wait_bvhcnt 0x0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: s_get_barrier_state s0, m0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_wait_alu 0xfffe -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX12-SDAG-LABEL: test5_s_get_barrier_state_m0: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_mov_b32 m0, s0 +; GFX12-SDAG-NEXT: s_get_barrier_state s0, m0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GLOBAL-ISEL-LABEL: test5_s_get_barrier_state_m0: -; GLOBAL-ISEL: ; %bb.0: -; GLOBAL-ISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_expcnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_samplecnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_bvhcnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0 -; GLOBAL-ISEL-NEXT: s_get_barrier_state s0, m0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v0, s0 -; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31] +; GFX12-GISEL-LABEL: test5_s_get_barrier_state_m0: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 m0, v0 +; GFX12-GISEL-NEXT: s_get_barrier_state s0, m0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 %arg) ret i32 %state } +define i32 @test6_s_get_barrier_state_0() { +; GFX12-LABEL: test6_s_get_barrier_state_0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: 
s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_get_barrier_state s0, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 0) + ret i32 %state +} + define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 { -; GCN-LABEL: test_barrier_convert: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v3, v2, s[0:1] -; GCN-NEXT: s_wait_storecnt 0x0 -; GCN-NEXT: s_barrier_signal -1 -; GCN-NEXT: s_barrier_wait -1 -; GCN-NEXT: global_store_b32 v3, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test_barrier_convert: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_barrier_signal -1 +; GFX12-SDAG-NEXT: s_barrier_wait -1 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test_barrier_convert: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 -; GLOBAL-ISEL-NEXT: s_barrier_signal -1 -; GLOBAL-ISEL-NEXT: s_barrier_wait -1 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test_barrier_convert: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_barrier_signal -1 +; GFX12-GISEL-NEXT: s_barrier_wait -1 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp diff --git a/llvm/test/CodeGen/NVPTX/intrin-nocapture.ll b/llvm/test/CodeGen/NVPTX/intrin-nocapture.ll deleted file mode 100644 index 040bbde13800cd..00000000000000 --- a/llvm/test/CodeGen/NVPTX/intrin-nocapture.ll +++ /dev/null @@ -1,21 +0,0 @@ -; RUN: opt < %s -O3 -S | FileCheck %s - -; Address space intrinsics were erroneously marked NoCapture, leading to bad -; optimizations (such as the store below being eliminated as dead code). This -; test makes sure we don't regress. - -declare void @foo(ptr addrspace(1)) - -declare ptr addrspace(1) @llvm.nvvm.ptr.gen.to.global.p1.p0(ptr) - -; CHECK: @bar -define void @bar() { - %t1 = alloca i32 -; CHECK: call ptr addrspace(1) @llvm.nvvm.ptr.gen.to.global.p1.p0(ptr nonnull %t1) -; CHECK-NEXT: store i32 10, ptr %t1 - %t2 = call ptr addrspace(1) @llvm.nvvm.ptr.gen.to.global.p1.p0(ptr %t1) - store i32 10, ptr %t1 - call void @foo(ptr addrspace(1) %t2) - ret void -} - diff --git a/llvm/test/CodeGen/SPARC/salvage-debug-isel.ll b/llvm/test/CodeGen/SPARC/salvage-debug-isel.ll new file mode 100644 index 00000000000000..ce44d3ab7fd082 --- /dev/null +++ b/llvm/test/CodeGen/SPARC/salvage-debug-isel.ll @@ -0,0 +1,69 @@ +; RUN: llc -march=sparc -O1 %s -o - -stop-after=finalize-isel | FileCheck %s + +; Debug info salvaging in isel means we should see a location for this variable. + +; CHECK-LABEL: name: a +; CHECK: DBG_VALUE %stack.0.b, $noreg, ![[#]], !DIExpression(DW_OP_plus_uconst, 3, DW_OP_stack_value) + +define dso_local zeroext i16 @a() local_unnamed_addr #0 !dbg !7 { +entry: + %b = alloca [6 x i8], align 1 + %arrayidx = getelementptr inbounds [6 x i8], ptr %b, i32 0, i32 undef, !dbg !27 + store i8 4, ptr %arrayidx, align 1, !dbg !28 + %arrayidx1 = getelementptr inbounds i8, ptr %b, i32 3, !dbg !32 + #dbg_value(ptr %arrayidx1, !22, !DIExpression(), !25) + %0 = load i8, ptr %arrayidx1, align 1, !dbg !33 + %tobool.not = icmp eq i8 %0, 0, !dbg !35 + br i1 %tobool.not, label %if.end, label %for.cond, !dbg !36 + +for.cond: ; preds = %entry, %for.cond + br label %for.cond, !dbg !37, !llvm.loop !40 + +if.end: ; preds = %entry + ret i16 undef, !dbg !44 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 20.0.0git.prerel", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "file.c", directory: "/path", checksumkind: CSK_MD5, checksum: "aa7b5139660a2329a6409414c44cc1f6") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 7, !"debug-info-assignment-tracking", i1 true} +!6 = !{!"clang version 20.0.0git.prerel"} +!7 = distinct !DISubprogram(name: "a", scope: !1, file: !1, line: 2, type: !8, scopeLine: 2, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !13) +!8 = !DISubroutineType(types: !9) +!9 = !{!10} +!10 = !DIDerivedType(tag: DW_TAG_typedef, name: "uint16_t", file: !11, line: 277, baseType: !12) +!11 = !DIFile(filename: "stdint.h", directory: "", checksumkind: CSK_MD5, checksum: "d9e8f73f3756bbd642f1729623e09484") +!12 = !DIBasicType(name: "unsigned short", size: 16, encoding: DW_ATE_unsigned) +!13 = !{!14, !20, !22} +!14 = !DILocalVariable(name: "b", 
scope: !7, file: !1, line: 3, type: !15) +!15 = !DICompositeType(tag: DW_TAG_array_type, baseType: !16, size: 48, elements: !18) +!16 = !DIDerivedType(tag: DW_TAG_typedef, name: "int8_t", file: !11, line: 298, baseType: !17) +!17 = !DIBasicType(name: "signed char", size: 8, encoding: DW_ATE_signed_char) +!18 = !{!19} +!19 = !DISubrange(count: 6) +!20 = !DILocalVariable(name: "c", scope: !7, file: !1, line: 4, type: !21) +!21 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!22 = !DILocalVariable(name: "d", scope: !7, file: !1, line: 6, type: !23) +!23 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 32) +!25 = !DILocation(line: 0, scope: !7) +!27 = !DILocation(line: 5, column: 3, scope: !7) +!28 = !DILocation(line: 5, column: 8, scope: !7) +!32 = !DILocation(line: 6, column: 16, scope: !7) +!33 = !DILocation(line: 7, column: 33, scope: !34) +!34 = distinct !DILexicalBlock(scope: !7, file: !1, line: 7, column: 7) +!35 = !DILocation(line: 7, column: 7, scope: !34) +!36 = !DILocation(line: 7, column: 7, scope: !7) +!37 = !DILocation(line: 8, column: 5, scope: !38) +!38 = distinct !DILexicalBlock(scope: !39, file: !1, line: 8, column: 5) +!39 = distinct !DILexicalBlock(scope: !34, file: !1, line: 8, column: 5) +!40 = distinct !{!40, !41, !42, !43} +!41 = !DILocation(line: 8, column: 5, scope: !39) +!42 = !DILocation(line: 9, column: 7, scope: !39) +!43 = !{!"llvm.loop.unroll.disable"} +!44 = !DILocation(line: 10, column: 1, scope: !7) diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll index c2a009f06b89df..502249a87c4892 100644 --- a/llvm/test/CodeGen/X86/pmulh.ll +++ b/llvm/test/CodeGen/X86/pmulh.ll @@ -937,6 +937,56 @@ define <16 x i32> @zext_mulhuw_v16i16_lshr(<16 x i16> %a, <16 x i16> %b) { ret <16 x i32> %d } +; PR109790 +define <16 x i16> @zext_mulhuw_v16i16_negative_constant(<16 x i16> %a) { +; SSE-LABEL: zext_mulhuw_v16i16_negative_constant: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [64536,64536,64536,64536,64536,64536,64536,64536] +; SSE-NEXT: pmulhw %xmm2, %xmm0 +; SSE-NEXT: pmulhw %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: zext_mulhuw_v16i16_negative_constant: +; AVX: # %bb.0: +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536] +; AVX-NEXT: retq + %k = and <16 x i16> %a, + %x = zext nneg <16 x i16> %k to <16 x i32> + %m = mul nsw <16 x i32> %x, + %s = lshr <16 x i32> %m, + %t = trunc nuw <16 x i32> %s to <16 x i16> + ret <16 x i16> %t +} + +; PR109790 +define <16 x i16> @zext_mulhuw_v16i16_positive_constant(<16 x i16> %a) { +; SSE-LABEL: zext_mulhuw_v16i16_positive_constant: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1000,1000,1000,1000,1000,1000,1000,1000] +; SSE-NEXT: pmulhw %xmm2, %xmm0 +; SSE-NEXT: pmulhw %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: zext_mulhuw_v16i16_positive_constant: +; AVX: # %bb.0: +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000] +; AVX-NEXT: 
retq + %k = and <16 x i16> %a, + %x = zext nneg <16 x i16> %k to <16 x i32> + %m = mul nuw nsw <16 x i32> %x, + %s = lshr <16 x i32> %m, + %t = trunc nuw nsw <16 x i32> %s to <16 x i16> + ret <16 x i16> %t +} + define <16 x i32> @mulhsw_v16i16_lshr(<16 x i16> %a, <16 x i16> %b) { ; SSE2-LABEL: mulhsw_v16i16_lshr: ; SSE2: # %bb.0: @@ -2056,3 +2106,4 @@ define <8 x i16> @sse2_pmulhu_w_const(<8 x i16> %a0, <8 x i16> %a1) { ret <8 x i16> %res } declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) + diff --git a/llvm/test/CodeGen/X86/pr57673.ll b/llvm/test/CodeGen/X86/pr57673.ll index cf7717f420480b..4ca8ae91f9e6fc 100644 --- a/llvm/test/CodeGen/X86/pr57673.ll +++ b/llvm/test/CodeGen/X86/pr57673.ll @@ -37,7 +37,7 @@ define void @foo() { ; NORMAL-NEXT: {{ $}} ; NORMAL-NEXT: [[MOVUPSrm:%[0-9]+]]:vr128 = MOVUPSrm %stack.1.i, 1, $noreg, 40, $noreg :: (load (s128) from %ir.i4, align 8) ; NORMAL-NEXT: MOVUPSmr $noreg, 1, $noreg, 0, $noreg, killed [[MOVUPSrm]] :: (store (s128) into `ptr null`, align 8) - ; NORMAL-NEXT: DBG_VALUE $noreg, $noreg, !3, !DIExpression(), debug-location !8 + ; NORMAL-NEXT: DBG_VALUE_LIST !3, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_plus_uconst, 40, DW_OP_stack_value), %stack.1.i, %stack.1.i, debug-location !8 ; NORMAL-NEXT: [[MOVUPSrm1:%[0-9]+]]:vr128 = MOVUPSrm %stack.1.i, 1, $noreg, 40, $noreg :: (load (s128) from %ir.i6, align 8) ; NORMAL-NEXT: MOVUPSmr $noreg, 1, $noreg, 0, $noreg, killed [[MOVUPSrm1]] :: (store (s128) into `ptr null`, align 8) ; NORMAL-NEXT: {{ $}} @@ -76,7 +76,7 @@ define void @foo() { ; INSTRREF-NEXT: {{ $}} ; INSTRREF-NEXT: [[MOVUPSrm:%[0-9]+]]:vr128 = MOVUPSrm %stack.1.i, 1, $noreg, 40, $noreg :: (load (s128) from %ir.i4, align 8) ; INSTRREF-NEXT: MOVUPSmr $noreg, 1, $noreg, 0, $noreg, killed [[MOVUPSrm]] :: (store (s128) into `ptr null`, align 8) - ; INSTRREF-NEXT: DBG_VALUE $noreg, $noreg, !3, !DIExpression(), debug-location !8 + ; INSTRREF-NEXT: DBG_VALUE_LIST !3, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_plus_uconst, 40, DW_OP_stack_value), %stack.1.i, %stack.1.i, debug-location !8 ; INSTRREF-NEXT: [[MOVUPSrm1:%[0-9]+]]:vr128 = MOVUPSrm %stack.1.i, 1, $noreg, 40, $noreg :: (load (s128) from %ir.i6, align 8) ; INSTRREF-NEXT: MOVUPSmr $noreg, 1, $noreg, 0, $noreg, killed [[MOVUPSrm1]] :: (store (s128) into `ptr null`, align 8) ; INSTRREF-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll index 9b32005927ace7..61814b48e6b3a3 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll @@ -146,3 +146,18 @@ define <64 x i8> @combine_permi2q_pshufb_as_permi2d_mask(<8 x i64> %a0, <8 x i64 %res2 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %res1, <64 x i8> , <64 x i8> zeroinitializer, i64 %m) ret <64 x i8> %res2 } + +; PR109272 +define <64 x i8> @combine_vpermi2var_v64i8_with_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> %a2) { +; CHECK-LABEL: combine_vpermi2var_v64i8_with_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2b %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: vpmovb2m %zmm1, %k0 +; CHECK-NEXT: vpmovm2b %k0, %zmm1 +; CHECK-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %perm = tail call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> %a2) + %cmp = icmp slt <64 x i8> %a1, zeroinitializer + %sel = select <64 x i1> %cmp, <64 x i8> zeroinitializer, <64 x i8> %perm + ret <64 x i8> %sel +} diff --git 
a/llvm/test/DebugInfo/Generic/debug-ranges-duplication.ll b/llvm/test/DebugInfo/Generic/debug-ranges-duplication.ll index e9c23100eeda52..b31469e899d650 100644 --- a/llvm/test/DebugInfo/Generic/debug-ranges-duplication.ll +++ b/llvm/test/DebugInfo/Generic/debug-ranges-duplication.ll @@ -1,5 +1,5 @@ ; AIX doesn't currently support DWARF 5 section .debug_rnglists -; XFAIL: target={{.*}}-aix{{.*}} +; XFAIL: target={{.*}}-zos{{.*}}, target={{.*}}-aix{{.*}} ; RUN: %llc_dwarf -O0 -filetype=obj < %s | llvm-dwarfdump -debug-info - | FileCheck %s ; diff --git a/llvm/test/DebugInfo/NVPTX/debug-info.ll b/llvm/test/DebugInfo/NVPTX/debug-info.ll index 9948925db57c9f..922a420820f469 100644 --- a/llvm/test/DebugInfo/NVPTX/debug-info.ll +++ b/llvm/test/DebugInfo/NVPTX/debug-info.ll @@ -25,6 +25,10 @@ ; CHECK-DAG: .reg .b64 %rd<8>; ; CHECK: .loc [[DEBUG_INFO_CU:[0-9]+]] 5 0 ; CHECK: ld.param.u32 %r{{.+}}, [{{.+}}]; +; CHECK: ld.param.u64 %rd{{.+}}, [{{.+}}]; +; CHECK: cvta.to.global.u64 %rd{{.+}}, %rd{{.+}}; +; CHECK: ld.param.u64 %rd{{.+}}, [{{.+}}]; +; CHECK: cvta.to.global.u64 %rd{{.+}}, %rd{{.+}}; ; CHECK: .loc [[BUILTUIN_VARS_H:[0-9]+]] 78 180 ; CHECK: mov.u32 %r{{.+}}, %ctaid.x; ; CHECK: .loc [[BUILTUIN_VARS_H]] 89 180 @@ -38,10 +42,6 @@ ; CHECK: .loc [[DEBUG_INFO_CU]] 7 7 ; CHECK: @%p{{.+}} bra [[BB:\$L__.+]]; ; CHECK: ld.param.f32 %f{{.+}}, [{{.+}}]; -; CHECK: ld.param.u64 %rd{{.+}}, [{{.+}}]; -; CHECK: cvta.to.global.u64 %rd{{.+}}, %rd{{.+}}; -; CHECK: ld.param.u64 %rd{{.+}}, [{{.+}}]; -; CHECK: cvta.to.global.u64 %rd{{.+}}, %rd{{.+}}; ; CHECK: .loc [[DEBUG_INFO_CU]] 8 13 ; CHECK: mul.wide.u32 %rd{{.+}}, %r{{.+}}, 4; ; CHECK: add.s64 %rd{{.+}}, %rd{{.+}}, %rd{{.+}}; @@ -2661,22 +2661,22 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b32 4579 // DW_AT_type ; CHECK-NEXT:.b8 25 // Abbrev [25] 0x8aa:0x18 DW_TAG_inlined_subroutine ; CHECK-NEXT:.b32 707 // DW_AT_abstract_origin -; CHECK-NEXT:.b64 $L__tmp0 // DW_AT_low_pc -; CHECK-NEXT:.b64 $L__tmp1 // DW_AT_high_pc +; CHECK-NEXT:.b64 $L__tmp1 // DW_AT_low_pc +; CHECK-NEXT:.b64 $L__tmp2 // DW_AT_high_pc ; CHECK-NEXT:.b8 1 // DW_AT_call_file ; CHECK-NEXT:.b8 6 // DW_AT_call_line ; CHECK-NEXT:.b8 11 // DW_AT_call_column ; CHECK-NEXT:.b8 25 // Abbrev [25] 0x8c2:0x18 DW_TAG_inlined_subroutine ; CHECK-NEXT:.b32 1466 // DW_AT_abstract_origin -; CHECK-NEXT:.b64 $L__tmp1 // DW_AT_low_pc -; CHECK-NEXT:.b64 $L__tmp2 // DW_AT_high_pc +; CHECK-NEXT:.b64 $L__tmp2 // DW_AT_low_pc +; CHECK-NEXT:.b64 $L__tmp3 // DW_AT_high_pc ; CHECK-NEXT:.b8 1 // DW_AT_call_file ; CHECK-NEXT:.b8 6 // DW_AT_call_line ; CHECK-NEXT:.b8 24 // DW_AT_call_column ; CHECK-NEXT:.b8 25 // Abbrev [25] 0x8da:0x18 DW_TAG_inlined_subroutine ; CHECK-NEXT:.b32 2060 // DW_AT_abstract_origin -; CHECK-NEXT:.b64 $L__tmp2 // DW_AT_low_pc -; CHECK-NEXT:.b64 $L__tmp3 // DW_AT_high_pc +; CHECK-NEXT:.b64 $L__tmp3 // DW_AT_low_pc +; CHECK-NEXT:.b64 $L__tmp4 // DW_AT_high_pc ; CHECK-NEXT:.b8 1 // DW_AT_call_file ; CHECK-NEXT:.b8 6 // DW_AT_call_line ; CHECK-NEXT:.b8 37 // DW_AT_call_column diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_R_X86_64_PC8.s b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_R_X86_64_PC.s similarity index 61% rename from llvm/test/ExecutionEngine/JITLink/x86-64/ELF_R_X86_64_PC8.s rename to llvm/test/ExecutionEngine/JITLink/x86-64/ELF_R_X86_64_PC.s index 46b851a836abb8..d88875e3855114 100644 --- a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_R_X86_64_PC8.s +++ b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_R_X86_64_PC.s @@ -2,7 +2,7 @@ # RUN: -filetype=obj -o %t.o %s # RUN: 
llvm-jitlink -noexec %t.o # -# Check R_X86_64_PC8 handling. +# Check R_X86_64_PC* handling. .text .globl main @@ -14,3 +14,6 @@ main: .rodata .byte main-. # Generate R_X86_64_PC8 relocation. + .short main-. # Generate R_X86_64_PC16 relocation. + .long main-. # Generate R_X86_64_PC32 relocation. + .quad main-. # Generate R_X86_64_PC64 relocation. diff --git a/llvm/test/TableGen/listflatten-error.td b/llvm/test/TableGen/listflatten-error.td new file mode 100644 index 00000000000000..56062420982a11 --- /dev/null +++ b/llvm/test/TableGen/listflatten-error.td @@ -0,0 +1,6 @@ +// RUN: not llvm-tblgen %s 2>&1 | FileCheck %s -DFILE=%s + +// CHECK: [[FILE]]:[[@LINE+2]]:33: error: expected list type argument in unary operator +class Flatten { + list F = !listflatten(A); +} diff --git a/llvm/test/TableGen/listflatten.td b/llvm/test/TableGen/listflatten.td new file mode 100644 index 00000000000000..bc9b1c71ea88d7 --- /dev/null +++ b/llvm/test/TableGen/listflatten.td @@ -0,0 +1,32 @@ +// RUN: llvm-tblgen %s | FileCheck %s + +class Flatten A, list B> { + list Flat1 = !listflatten([A, B, [6], [7, 8]]); + + list> X = [A, B]; + list Flat2 = !listflatten(!listconcat(X, [[7]])); + + // Generate a nested list of integers. + list Y0 = [1, 2, 3, 4]; + list> Y1 = !foreach(elem, Y0, [elem]); + list>> Y2 = !foreach(elem, Y1, [elem]); + list>>> Y3 = !foreach(elem, Y2, [elem]); + + // Flatten it completely. + list Flat3=!listflatten(!listflatten(!listflatten(Y3))); + + // Flatten it partially. + list>> Flat4 = !listflatten(Y3); + list> Flat5 = !listflatten(!listflatten(Y3)); + + // Test NOP flattening. + list Flat6 = !listflatten(["a", "b"]); +} + +// CHECK: list Flat1 = [1, 2, 3, 4, 5, 6, 7, 8]; +// CHECK: list Flat2 = [1, 2, 3, 4, 5, 7]; +// CHECK: list Flat3 = [1, 2, 3, 4]; +// CHECK{LITERAL}: list>> Flat4 = [[[1]], [[2]], [[3]], [[4]]]; +// CHECK: list Flat6 = ["a", "b"]; +def F : Flatten<[1,2], [3,4,5]>; + diff --git a/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-transform.ll b/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-transform.ll index c38f81d0f046ef..cba1ba8dde768e 100644 --- a/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-transform.ll +++ b/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-transform.ll @@ -300,3 +300,126 @@ define void @self-reference() { end: ret void } + +define void @pr106083_invalidBBarg_fold(i1 %cmp1, i1 %cmp2, i1 %not, ptr %d) { +; CHECK-LABEL: @pr106083_invalidBBarg_fold( +; CHECK-NEXT: bb: +; CHECK-NEXT: br i1 [[CMP1:%.*]], label [[BB1:%.*]], label [[SEL_SI_UNFOLD_FALSE:%.*]] +; CHECK: sel.si.unfold.false: +; CHECK-NEXT: [[DOTSI_UNFOLD_PHI1:%.*]] = phi i32 [ 1, [[BB:%.*]] ] +; CHECK-NEXT: br label [[BB1]] +; CHECK: BB1: +; CHECK-NEXT: [[I:%.*]] = phi i16 [ 0, [[BB1_BACKEDGE:%.*]] ], [ 0, [[BB]] ], [ 1, [[BB7:%.*]] ], [ 0, [[SEL_SI_UNFOLD_FALSE]] ], [ 1, [[BB7_JT0:%.*]] ] +; CHECK-NEXT: [[SEL_SI_UNFOLD_PHI:%.*]] = phi i32 [ [[SEL_SI_UNFOLD_PHI]], [[BB1_BACKEDGE]] ], [ [[SEL_SI_UNFOLD_PHI]], [[BB7]] ], [ 0, [[BB]] ], [ [[DOTSI_UNFOLD_PHI1]], [[SEL_SI_UNFOLD_FALSE]] ], [ [[SEL_SI_UNFOLD_PHI]], [[BB7_JT0]] ] +; CHECK-NEXT: br i1 [[NOT:%.*]], label [[BB7_JT0]], label [[BB2:%.*]] +; CHECK: BB2: +; CHECK-NEXT: store i16 0, ptr [[D:%.*]], align 2 +; CHECK-NEXT: br i1 [[CMP2:%.*]], label [[BB7]], label [[SPEC_SELECT_SI_UNFOLD_FALSE_JT0:%.*]] +; CHECK: spec.select.si.unfold.false: +; CHECK-NEXT: br label [[BB7]] +; CHECK: spec.select.si.unfold.false.jt0: +; CHECK-NEXT: [[DOTSI_UNFOLD_PHI_JT0:%.*]] = phi i32 [ 0, [[BB2]] ] +; CHECK-NEXT: br label 
[[BB7_JT0]] +; CHECK: BB7: +; CHECK-NEXT: [[D_PROMOTED4:%.*]] = phi i16 [ 1, [[BB2]] ], [ 1, [[SPEC_SELECT_SI_UNFOLD_FALSE:%.*]] ] +; CHECK-NEXT: [[_3:%.*]] = phi i32 [ [[SEL_SI_UNFOLD_PHI]], [[BB2]] ], [ poison, [[SPEC_SELECT_SI_UNFOLD_FALSE]] ] +; CHECK-NEXT: switch i32 [[_3]], label [[BB1_BACKEDGE]] [ +; CHECK-NEXT: i32 0, label [[BB1]] +; CHECK-NEXT: i32 1, label [[BB8:%.*]] +; CHECK-NEXT: ] +; CHECK: BB7.jt0: +; CHECK-NEXT: [[D_PROMOTED4_JT0:%.*]] = phi i16 [ 0, [[BB1]] ], [ 1, [[SPEC_SELECT_SI_UNFOLD_FALSE_JT0]] ] +; CHECK-NEXT: [[_3_JT0:%.*]] = phi i32 [ 0, [[BB1]] ], [ [[DOTSI_UNFOLD_PHI_JT0]], [[SPEC_SELECT_SI_UNFOLD_FALSE_JT0]] ] +; CHECK-NEXT: br label [[BB1]] +; CHECK: BB1.backedge: +; CHECK-NEXT: br label [[BB1]] +; CHECK: BB8: +; CHECK-NEXT: ret void +; +bb: + %sel = select i1 %cmp1, i32 0, i32 1 + br label %BB1 + +BB1: ; preds = %BB1.backedge, %BB7, %bb + %i = phi i16 [ 0, %BB1.backedge ], [ 0, %bb ], [ 1, %BB7 ] + br i1 %not, label %BB7, label %BB2 + +BB2: ; preds = %BB1 + store i16 0, ptr %d, align 2 + %spec.select = select i1 %cmp2, i32 %sel, i32 0 + br label %BB7 + +BB7: ; preds = %BB2, %BB1 + %d.promoted4 = phi i16 [ 0, %BB1 ], [ 1, %BB2 ] + %_3 = phi i32 [ 0, %BB1 ], [ %spec.select, %BB2 ] + switch i32 %_3, label %BB1.backedge [ + i32 0, label %BB1 + i32 1, label %BB8 + ] + +BB1.backedge: ; preds = %BB7 + br label %BB1 + +BB8: ; preds = %BB7 + ret void +} + +define void @pr106083_select_dead_uses(i1 %cmp1, i1 %not, ptr %p) { +; CHECK-LABEL: @pr106083_select_dead_uses( +; CHECK-NEXT: bb: +; CHECK-NEXT: br i1 [[CMP1:%.*]], label [[DOTLOOPEXIT6:%.*]], label [[SPEC_SELECT_SI_UNFOLD_FALSE:%.*]] +; CHECK: spec.select.si.unfold.false: +; CHECK-NEXT: [[DOTSI_UNFOLD_PHI1:%.*]] = phi i32 [ 1, [[BB:%.*]] ] +; CHECK-NEXT: br label [[DOTLOOPEXIT6]] +; CHECK: .loopexit6: +; CHECK-NEXT: [[SPEC_SELECT_SI_UNFOLD_PHI:%.*]] = phi i32 [ [[SPEC_SELECT_SI_UNFOLD_PHI]], [[SELECT_UNFOLD:%.*]] ], [ 0, [[BB]] ], [ [[DOTSI_UNFOLD_PHI1]], [[SPEC_SELECT_SI_UNFOLD_FALSE]] ] +; CHECK-NEXT: br i1 [[NOT:%.*]], label [[SELECT_UNFOLD_JT0:%.*]], label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[I:%.*]] = load i32, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[NOT2:%.*]] = icmp eq i32 0, 0 +; CHECK-NEXT: br i1 [[NOT2]], label [[SELECT_UNFOLD]], label [[SPEC_SELECT7_SI_UNFOLD_FALSE_JT0:%.*]] +; CHECK: spec.select7.si.unfold.false: +; CHECK-NEXT: br label [[SELECT_UNFOLD]] +; CHECK: spec.select7.si.unfold.false.jt0: +; CHECK-NEXT: [[DOTSI_UNFOLD_PHI_JT0:%.*]] = phi i32 [ 0, [[BB1]] ] +; CHECK-NEXT: br label [[SELECT_UNFOLD_JT0]] +; CHECK: select.unfold: +; CHECK-NEXT: [[_2:%.*]] = phi i32 [ [[SPEC_SELECT_SI_UNFOLD_PHI]], [[BB1]] ], [ poison, [[SPEC_SELECT7_SI_UNFOLD_FALSE:%.*]] ] +; CHECK-NEXT: switch i32 [[_2]], label [[BB2:%.*]] [ +; CHECK-NEXT: i32 0, label [[DOTPREHEADER_PREHEADER:%.*]] +; CHECK-NEXT: i32 1, label [[DOTLOOPEXIT6]] +; CHECK-NEXT: ] +; CHECK: select.unfold.jt0: +; CHECK-NEXT: [[_2_JT0:%.*]] = phi i32 [ 0, [[DOTLOOPEXIT6]] ], [ [[DOTSI_UNFOLD_PHI_JT0]], [[SPEC_SELECT7_SI_UNFOLD_FALSE_JT0]] ] +; CHECK-NEXT: br label [[DOTPREHEADER_PREHEADER]] +; CHECK: .preheader.preheader: +; CHECK-NEXT: ret void +; CHECK: bb2: +; CHECK-NEXT: unreachable +; +bb: + %spec.select = select i1 %cmp1, i32 0, i32 1 + br label %.loopexit6 + +.loopexit6: ; preds = %select.unfold, %bb + br i1 %not, label %select.unfold, label %bb1 + +bb1: ; preds = %.loopexit6 + %i = load i32, ptr %p, align 4 + %not2 = icmp eq i32 0, 0 + %spec.select7 = select i1 %not2, i32 %spec.select, i32 0 + br label %select.unfold + +select.unfold: ; 
preds = %bb1, %.loopexit6 + %_2 = phi i32 [ 0, %.loopexit6 ], [ %spec.select7, %bb1 ] + switch i32 %_2, label %bb2 [ + i32 0, label %.preheader.preheader + i32 1, label %.loopexit6 + ] + +.preheader.preheader: ; preds = %select.unfold + ret void + +bb2: ; preds = %select.unfold + unreachable +} diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-insr.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-insr.ll new file mode 100644 index 00000000000000..e8489c5be85c41 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-insr.ll @@ -0,0 +1,73 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +define @insr_val_into_splatted_val_int(i8 %a) #0 { +; CHECK-LABEL: @insr_val_into_splatted_val_int( +; CHECK-NEXT: [[T0:%.*]] = insertelement poison, i8 [[A:%.*]], i64 0 +; CHECK-NEXT: [[T1:%.*]] = shufflevector [[T0]], poison, zeroinitializer +; CHECK-NEXT: ret [[T1]] +; + %t0 = insertelement poison, i8 %a, i64 0 + %t1 = shufflevector %t0, poison, zeroinitializer + %t2 = tail call @llvm.aarch64.sve.insr.nxv16i8( %t1, i8 %a) + ret %t2 +} + +define @insr_five_into_fives() #0 { +; CHECK-LABEL: @insr_five_into_fives( +; CHECK-NEXT: ret shufflevector ( insertelement ( poison, i16 5, i64 0), poison, zeroinitializer) +; + %t1 = tail call @llvm.aarch64.sve.insr.nxv8i16( splat (i16 5), i16 5) + ret %t1 +} + +define @insr_val_into_splatted_val_fp(float %a) #0 { +; CHECK-LABEL: @insr_val_into_splatted_val_fp( +; CHECK-NEXT: [[T0:%.*]] = insertelement poison, float [[A:%.*]], i64 0 +; CHECK-NEXT: [[T1:%.*]] = shufflevector [[T0]], poison, zeroinitializer +; CHECK-NEXT: ret [[T1]] +; + %t0 = insertelement poison, float %a, i64 0 + %t1 = shufflevector %t0, poison, zeroinitializer + %t2 = tail call @llvm.aarch64.sve.insr.nxv4f32( %t1, float %a) + ret %t2 +} + +define @insr_zero_into_zero() #0 { +; CHECK-LABEL: @insr_zero_into_zero( +; CHECK-NEXT: ret zeroinitializer +; + %t1 = tail call @llvm.aarch64.sve.insr.nxv2f64( zeroinitializer, double zeroinitializer) + ret %t1 +} + +define @insr_val_into_splatted_other(i8 %a, i8 %b) #0 { +; CHECK-LABEL: @insr_val_into_splatted_other( +; CHECK-NEXT: [[T0:%.*]] = insertelement poison, i8 [[B:%.*]], i64 0 +; CHECK-NEXT: [[T1:%.*]] = shufflevector [[T0]], poison, zeroinitializer +; CHECK-NEXT: [[T2:%.*]] = tail call @llvm.aarch64.sve.insr.nxv16i8( [[T1]], i8 [[A:%.*]]) +; CHECK-NEXT: ret [[T2]] +; + %t0 = insertelement poison, i8 %b, i64 0 + %t1 = shufflevector %t0, poison, zeroinitializer + %t2 = tail call @llvm.aarch64.sve.insr.nxv16i8( %t1, i8 %a) + ret %t2 +} + +define @insr_three_into_fives() #0 { +; CHECK-LABEL: @insr_three_into_fives( +; CHECK-NEXT: [[T1:%.*]] = tail call @llvm.aarch64.sve.insr.nxv8i16( shufflevector ( insertelement ( poison, i16 5, i64 0), poison, zeroinitializer), i16 3) +; CHECK-NEXT: ret [[T1]] +; + %t1 = tail call @llvm.aarch64.sve.insr.nxv8i16( splat (i16 5), i16 3) + ret %t1 +} + +declare @llvm.aarch64.sve.insr.nxv16i8(, i8) +declare @llvm.aarch64.sve.insr.nxv8i16(, i16) +declare @llvm.aarch64.sve.insr.nxv4f32(, float) +declare @llvm.aarch64.sve.insr.nxv2f64(, double) + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/select-profitability.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/select-profitability.ll new file mode 100644 index 00000000000000..4496b19fa200c5 --- /dev/null +++ 
b/llvm/test/Transforms/SLPVectorizer/RISCV/select-profitability.ll @@ -0,0 +1,55 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S --passes=slp-vectorizer -mtriple=riscv64-unknown-linux -mattr=+v < %s | FileCheck %s + +define i32 @pow2_zero_constant_shift(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i16 zeroext %d) { +; CHECK-LABEL: define i32 @pow2_zero_constant_shift( +; CHECK-SAME: i16 zeroext [[A:%.*]], i16 zeroext [[B:%.*]], i16 zeroext [[C:%.*]], i16 zeroext [[D:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[B]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[C]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i16> [[TMP3]], i16 [[D]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i16> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> , <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: ret i32 [[TMP7]] +; + %t39.i0 = icmp eq i16 %a, 1 + %t39.i1 = icmp eq i16 %b, 1 + %t39.i2 = icmp eq i16 %c, 1 + %t39.i3 = icmp eq i16 %d, 1 + %t40.i0 = select i1 %t39.i0, i32 65536, i32 0 + %t40.i1 = select i1 %t39.i1, i32 65536, i32 0 + %t40.i2 = select i1 %t39.i2, i32 65536, i32 0 + %t40.i3 = select i1 %t39.i3, i32 65536, i32 0 + %or.rdx0 = or i32 %t40.i0, %t40.i1 + %or.rdx1 = or i32 %t40.i2, %t40.i3 + %or.rdx2 = or i32 %or.rdx0, %or.rdx1 + ret i32 %or.rdx2 +} + +; TODO: This case is unprofitable, and we should not be vectorizing this. +define i32 @pow2_zero_variable_shift(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i16 zeroext %d) { +; CHECK-LABEL: define i32 @pow2_zero_variable_shift( +; CHECK-SAME: i16 zeroext [[A:%.*]], i16 zeroext [[B:%.*]], i16 zeroext [[C:%.*]], i16 zeroext [[D:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[B]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[C]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i16> [[TMP3]], i16 [[D]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i16> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> , <4 x i32> zeroinitializer +; CHECK-NEXT: [[OR_RDX2:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: ret i32 [[OR_RDX2]] +; + %t39.i0 = icmp eq i16 %a, 1 + %t39.i1 = icmp eq i16 %b, 1 + %t39.i2 = icmp eq i16 %c, 1 + %t39.i3 = icmp eq i16 %d, 1 + %t40.i0 = select i1 %t39.i0, i32 524288, i32 0 + %t40.i1 = select i1 %t39.i1, i32 262144, i32 0 + %t40.i2 = select i1 %t39.i2, i32 131072, i32 0 + %t40.i3 = select i1 %t39.i3, i32 65536, i32 0 + %or.rdx0 = or i32 %t40.i0, %t40.i1 + %or.rdx1 = or i32 %t40.i2, %t40.i3 + %or.rdx2 = or i32 %or.rdx0, %or.rdx1 + ret i32 %or.rdx2 +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/peek-through-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/peek-through-shuffle.ll similarity index 85% rename from llvm/test/Transforms/SLPVectorizer/X86/peek-through-shuffle.ll rename to llvm/test/Transforms/SLPVectorizer/peek-through-shuffle.ll index c157f6117df959..839c1ebed6bcff 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/peek-through-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/peek-through-shuffle.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been 
autogenerated by utils/update_test_checks.py -; RUN: opt -S -passes=slp-vectorizer < %s -mtriple=x86_64-unknown-linux-gnu -o - | FileCheck %s +; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer < %s -mtriple=x86_64-unknown-linux-gnu -o - | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer < %s -mtriple=aarch64-unknown-linux-gnu -o - | FileCheck %s %} define void @foo(ptr %0, <4 x float> %1) { ; CHECK-LABEL: @foo( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll b/llvm/test/Transforms/SLPVectorizer/phi-node-bitwidt-op-not.ll similarity index 94% rename from llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll rename to llvm/test/Transforms/SLPVectorizer/phi-node-bitwidt-op-not.ll index f376ca71c77693..2037e0d67d2f89 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll +++ b/llvm/test/Transforms/SLPVectorizer/phi-node-bitwidt-op-not.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} define i32 @test(ptr %b, ptr %c, i32 %0, ptr %a, i1 %tobool3.not) { ; CHECK-LABEL: define i32 @test( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-undef-input.ll b/llvm/test/Transforms/SLPVectorizer/phi-undef-input.ll similarity index 96% rename from llvm/test/Transforms/SLPVectorizer/X86/phi-undef-input.ll rename to llvm/test/Transforms/SLPVectorizer/phi-undef-input.ll index 3cc32c1fc7b28e..b9802a0adb8aaf 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/phi-undef-input.ll +++ b/llvm/test/Transforms/SLPVectorizer/phi-undef-input.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=slp-vectorizer -slp-threshold=-1000 -mtriple=x86_64 -S | FileCheck %s +; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-threshold=-1000 -mtriple=x86_64 -S | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-threshold=-1000 -mtriple=aarch64-unknown-linux-gnu -S | FileCheck %s %} ; The inputs to vector phi should remain undef. 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/postponed_gathers.ll b/llvm/test/Transforms/SLPVectorizer/postponed_gathers.ll similarity index 90% rename from llvm/test/Transforms/SLPVectorizer/X86/postponed_gathers.ll rename to llvm/test/Transforms/SLPVectorizer/postponed_gathers.ll index 488ca0b23cd9c5..f6bed797b9ba91 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/postponed_gathers.ll +++ b/llvm/test/Transforms/SLPVectorizer/postponed_gathers.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt < %s -passes=slp-vectorizer -slp-threshold=-10 -mtriple=x86_64-unknown-linux-gnu -S | FileCheck %s +; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-threshold=-10 -mtriple=x86_64-unknown-linux-gnu -S | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-threshold=-10 -mtriple=aarch64-unknown-linux-gnu -S | FileCheck %s %} define void @foo() { ; CHECK-LABEL: define void @foo() { diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr31599-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/pr31599-inseltpoison.ll similarity index 78% rename from llvm/test/Transforms/SLPVectorizer/X86/pr31599-inseltpoison.ll rename to llvm/test/Transforms/SLPVectorizer/pr31599-inseltpoison.ll index 5506f61fe134bd..fe5871d73cd5e2 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr31599-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/pr31599-inseltpoison.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} define <2 x float> @foo() { ; CHECK-LABEL: @foo( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr31599.ll b/llvm/test/Transforms/SLPVectorizer/pr31599.ll similarity index 78% rename from llvm/test/Transforms/SLPVectorizer/X86/pr31599.ll rename to llvm/test/Transforms/SLPVectorizer/pr31599.ll index 348656e07c6be4..10b9b224d556e5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr31599.ll +++ b/llvm/test/Transforms/SLPVectorizer/pr31599.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} define <2 x float> @foo() { ; CHECK-LABEL: @foo( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-gather-non-scheduled-extracts.ll b/llvm/test/Transforms/SLPVectorizer/reduction-gather-non-scheduled-extracts.ll similarity index 85% rename from llvm/test/Transforms/SLPVectorizer/X86/reduction-gather-non-scheduled-extracts.ll rename to llvm/test/Transforms/SLPVectorizer/reduction-gather-non-scheduled-extracts.ll index 03c8767eff327f..f1034f39711351 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-gather-non-scheduled-extracts.ll +++ b/llvm/test/Transforms/SLPVectorizer/reduction-gather-non-scheduled-extracts.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-sie-ps5 < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-sie-ps5 < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} define void @tes() { ; CHECK-LABEL: define void @tes() { diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-modified-values.ll b/llvm/test/Transforms/SLPVectorizer/reduction-modified-values.ll similarity index 83% rename from llvm/test/Transforms/SLPVectorizer/X86/reduction-modified-values.ll rename to llvm/test/Transforms/SLPVectorizer/reduction-modified-values.ll index dbf490c5fe6a2f..be9318e467174a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-modified-values.ll +++ b/llvm/test/Transforms/SLPVectorizer/reduction-modified-values.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} define i32 @test() { ; CHECK-LABEL: @test( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll b/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll similarity index 93% rename from llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll rename to llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll index 1a6ff2385905b3..561182d5e4f49d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64 -slp-threshold=-150 | FileCheck %s +; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=x86_64 -slp-threshold=-150 | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=aarch64-unknown-linux-gnu -slp-threshold=-150 | FileCheck %s %} define i1 @test(ptr %arg, ptr %i233, i64 %i241, ptr %i235, ptr %i237, ptr %i227) { ; CHECK-LABEL: @test( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reordered-top-scalars.ll b/llvm/test/Transforms/SLPVectorizer/reordered-top-scalars.ll similarity index 83% rename from llvm/test/Transforms/SLPVectorizer/X86/reordered-top-scalars.ll rename to llvm/test/Transforms/SLPVectorizer/reordered-top-scalars.ll index 4517d27598b603..1de5ee2298837a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reordered-top-scalars.ll +++ b/llvm/test/Transforms/SLPVectorizer/reordered-top-scalars.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown %s -slp-threshold=-5 | FileCheck %s +; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown %s -slp-threshold=-5 | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown %s -slp-threshold=-5 | FileCheck %s %} define i32 @test(ptr %isec) { ; CHECK-LABEL: @test( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reordering-single-phi.ll 
b/llvm/test/Transforms/SLPVectorizer/reordering-single-phi.ll similarity index 93% rename from llvm/test/Transforms/SLPVectorizer/X86/reordering-single-phi.ll rename to llvm/test/Transforms/SLPVectorizer/reordering-single-phi.ll index bc1eaaac5d1bbc..a70daf9cf8d60c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reordering-single-phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/reordering-single-phi.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux < %s | FileCheck %s %} @a = external global [32000 x float], align 64 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reused-buildvector-matching-vectorized-node.ll b/llvm/test/Transforms/SLPVectorizer/reused-buildvector-matching-vectorized-node.ll similarity index 94% rename from llvm/test/Transforms/SLPVectorizer/X86/reused-buildvector-matching-vectorized-node.ll rename to llvm/test/Transforms/SLPVectorizer/reused-buildvector-matching-vectorized-node.ll index 2b425ee624700f..3e00550a885215 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reused-buildvector-matching-vectorized-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/reused-buildvector-matching-vectorized-node.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} define void @blam(ptr %arg, double %load2, i1 %fcmp3) { ; CHECK-LABEL: define void @blam diff --git a/llvm/test/Transforms/SLPVectorizer/revec-fix-109835.ll b/llvm/test/Transforms/SLPVectorizer/revec-fix-109835.ll new file mode 100644 index 00000000000000..965bfc7074c638 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/revec-fix-109835.ll @@ -0,0 +1,70 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=slp-vectorizer -S -slp-revec %s | FileCheck %s + +@b = external dso_local local_unnamed_addr global i64, align 8 +@d = external dso_local local_unnamed_addr global i32, align 4 +@c = external dso_local local_unnamed_addr global i32, align 4 +@a = external dso_local local_unnamed_addr global i8, align 2 + +define void @e() { +; CHECK-LABEL: @e( +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: [[C_PROMOTED5:%.*]] = load i32, ptr @c, align 4 +; CHECK-NEXT: [[A_PROMOTED7:%.*]] = load i8, ptr @a, align 2 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[C_PROMOTED5]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i8> , i8 [[A_PROMOTED7]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = add <16 x i32> [[DOTSPLAT]], +; CHECK-NEXT: [[TMP2:%.*]] = add <16 x i32> [[DOTSPLAT]], +; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i32> [[DOTSPLAT]], +; CHECK-NEXT: [[INDUCTION:%.*]] = add <16 x i32> [[DOTSPLAT]], +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult <16 x i32> [[TMP1]], +; 
CHECK-NEXT: [[TMP5:%.*]] = icmp ult <16 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP6:%.*]] = icmp ult <16 x i32> [[TMP3]], +; CHECK-NEXT: [[TMP7:%.*]] = icmp ult <16 x i32> [[INDUCTION]], +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <16 x i32> [[DOTSPLAT]], +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i1> [[TMP9]], [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i1> [[TMP10]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP11]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = zext <16 x i1> [[TMP12]] to <16 x i8> +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i8> [[TMP0]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> , <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> [[TMP15]]) +; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[C_PROMOTED5]], 81 +; CHECK-NEXT: store i64 -1, ptr @b, align 8 +; CHECK-NEXT: store i32 9, ptr @d, align 4 +; CHECK-NEXT: store i32 [[TMP17]], ptr @c, align 4 +; CHECK-NEXT: store i8 [[TMP16]], ptr @a, align 2 +; CHECK-NEXT: ret void +; +vector.ph: + %c.promoted5 = load i32, ptr @c, align 4 + %a.promoted7 = load i8, ptr @a, align 2 + %.splatinsert = insertelement <16 x i32> poison, i32 %c.promoted5, i64 0 + %.splat = shufflevector <16 x i32> %.splatinsert, <16 x i32> poison, <16 x i32> zeroinitializer + %0 = insertelement <16 x i8> , i8 %a.promoted7, i64 0 + %1 = add <16 x i32> %.splat, + %2 = add <16 x i32> %.splat, + %3 = add <16 x i32> %.splat, + %induction = add <16 x i32> %.splat, + %4 = icmp ult <16 x i32> %1, + %5 = icmp ult <16 x i32> %2, + %6 = icmp ult <16 x i32> %3, + %7 = icmp ult <16 x i32> %induction, + %8 = icmp eq <16 x i32> %.splat, + %9 = or <16 x i1> %4, %5 + %10 = or <16 x i1> %9, %6 + %11 = or <16 x i1> %10, %7 + %12 = or <16 x i1> %11, %8 + %13 = zext <16 x i1> %12 to <16 x i8> + %14 = or <16 x i8> %0, %13 + %15 = shufflevector <16 x i8> %14, <16 x i8> , <16 x i32> + %16 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %15) + %17 = add i32 %c.promoted5, 81 + store i64 -1, ptr @b, align 8 + store i32 9, ptr @d, align 4 + store i32 %17, ptr @c, align 4 + store i8 %16, ptr @a, align 2 + ret void +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/root-trunc-extract-reuse.ll b/llvm/test/Transforms/SLPVectorizer/root-trunc-extract-reuse.ll similarity index 86% rename from llvm/test/Transforms/SLPVectorizer/X86/root-trunc-extract-reuse.ll rename to llvm/test/Transforms/SLPVectorizer/root-trunc-extract-reuse.ll index af46b4f576234b..34c068478c5f5e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/root-trunc-extract-reuse.ll +++ b/llvm/test/Transforms/SLPVectorizer/root-trunc-extract-reuse.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-100 -mtriple=x86_64 < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -slp-threshold=-100 -mtriple=x86_64 < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S -slp-threshold=-100 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} define i1 @test() { ; CHECK-LABEL: @test( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll b/llvm/test/Transforms/SLPVectorizer/same-scalar-in-same-phi-extract.ll similarity index 88% rename from llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll rename to llvm/test/Transforms/SLPVectorizer/same-scalar-in-same-phi-extract.ll index 
f1be11d0d0fc51..fe0813542f3093 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll +++ b/llvm/test/Transforms/SLPVectorizer/same-scalar-in-same-phi-extract.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S --passes=slp-vectorizer -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -slp-threshold=-99999 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} define void @test(i32 %arg) { ; CHECK-LABEL: define void @test( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scalarazied-result.ll b/llvm/test/Transforms/SLPVectorizer/scalarazied-result.ll similarity index 60% rename from llvm/test/Transforms/SLPVectorizer/X86/scalarazied-result.ll rename to llvm/test/Transforms/SLPVectorizer/scalarazied-result.ll index 1d6e191c6f97bf..2570cdb45e1e78 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scalarazied-result.ll +++ b/llvm/test/Transforms/SLPVectorizer/scalarazied-result.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -S < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -S < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu -S < %s | FileCheck %s %} define void @test() { ; CHECK-LABEL: @test( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/scalarization-overhead.ll similarity index 92% rename from llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll rename to llvm/test/Transforms/SLPVectorizer/scalarization-overhead.ll index 55e155840f8588..9f6b285f1ab90a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll +++ b/llvm/test/Transforms/SLPVectorizer/scalarization-overhead.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=x86_64-- -passes=slp-vectorizer -S < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -mtriple=x86_64-- -passes=slp-vectorizer -S < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -mtriple=aarch64-- -passes=slp-vectorizer -S < %s | FileCheck %s %} ; Crash Test case reported on D134605 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll b/llvm/test/Transforms/SLPVectorizer/shrink_after_reorder2.ll similarity index 91% rename from llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll rename to llvm/test/Transforms/SLPVectorizer/shrink_after_reorder2.ll index 9e3ba05f88da8d..2f0bd4a8f1315c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll +++ b/llvm/test/Transforms/SLPVectorizer/shrink_after_reorder2.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -o - -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -S -o - -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -S -o - -passes=slp-vectorizer 
-mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} %class.e = type { i32, i32 } %struct.a = type { i32, i32, i32, i32 } diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll b/llvm/test/Transforms/SLPVectorizer/shuffle-multivector.ll similarity index 89% rename from llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll rename to llvm/test/Transforms/SLPVectorizer/shuffle-multivector.ll index c2555889f59816..2253c70dc25015 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll +++ b/llvm/test/Transforms/SLPVectorizer/shuffle-multivector.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-163 | FileCheck %s +; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-163 | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=aarch64-unknown-linux -slp-threshold=-163 | FileCheck %s %} define void @test1(i128 %p0, i128 %p1, i128 %p2, i128 %p3, <4 x i128> %vec) { ; CHECK-LABEL: @test1( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shufflebuilder-bug.ll b/llvm/test/Transforms/SLPVectorizer/shufflebuilder-bug.ll similarity index 89% rename from llvm/test/Transforms/SLPVectorizer/X86/shufflebuilder-bug.ll rename to llvm/test/Transforms/SLPVectorizer/shufflebuilder-bug.ll index 9db7d696c7c7eb..019c9eadd7c096 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/shufflebuilder-bug.ll +++ b/llvm/test/Transforms/SLPVectorizer/shufflebuilder-bug.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt -S -p slp-vectorizer -mtriple=x86_64-- %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -S -p slp-vectorizer -mtriple=x86_64-- %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -S -p slp-vectorizer -mtriple=aarch64-unknown-linux-gnu %s | FileCheck %s %} define void @foo(<4 x float> %vec, float %val, ptr %ptr) { ; CHECK-LABEL: define void @foo diff --git a/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll b/llvm/test/Transforms/SLPVectorizer/stores-non-ordered.ll similarity index 92% rename from llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll rename to llvm/test/Transforms/SLPVectorizer/stores-non-ordered.ll index a9748ca6291ae2..aaa6be73056bd4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll +++ b/llvm/test/Transforms/SLPVectorizer/stores-non-ordered.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -S -mtriple=x86_64-unknown -passes=slp-vectorizer -slp-min-reg-size=64 -slp-threshold=-1000 | FileCheck %s +; RUN: %if x86-registered-target %{ opt < %s -S -mtriple=x86_64-unknown -passes=slp-vectorizer -slp-min-reg-size=64 -slp-threshold=-1000 | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt < %s -S -mtriple=aarch64-unknown -passes=slp-vectorizer -slp-min-reg-size=64 -slp-threshold=-1000 | FileCheck %s %} define i32 @non-ordered-stores(ptr noalias nocapture %in, ptr noalias nocapture %inn, ptr noalias nocapture %out) { ; CHECK-LABEL: @non-ordered-stores( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/unknown-entries.ll b/llvm/test/Transforms/SLPVectorizer/unknown-entries.ll similarity index 82% rename from llvm/test/Transforms/SLPVectorizer/X86/unknown-entries.ll rename to 
llvm/test/Transforms/SLPVectorizer/unknown-entries.ll index fc22280c2b8ada..ca9aa451a9a3ae 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/unknown-entries.ll +++ b/llvm/test/Transforms/SLPVectorizer/unknown-entries.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt < %s -passes=slp-vectorizer -S | FileCheck %s - -target triple = "x86_64-unknown-linux-gnu" +; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -S | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu -S | FileCheck %s %} define <3 x i64> @ahyes(i64 %position, i64 %value) { ; CHECK-LABEL: define <3 x i64> @ahyes( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/zext-incoming-for-neg-icmp.ll b/llvm/test/Transforms/SLPVectorizer/zext-incoming-for-neg-icmp.ll similarity index 89% rename from llvm/test/Transforms/SLPVectorizer/X86/zext-incoming-for-neg-icmp.ll rename to llvm/test/Transforms/SLPVectorizer/zext-incoming-for-neg-icmp.ll index 7f086d17ca4c08..89fcc7e983749b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/zext-incoming-for-neg-icmp.ll +++ b/llvm/test/Transforms/SLPVectorizer/zext-incoming-for-neg-icmp.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} define i32 @test(i32 %a, i8 %b, i8 %c) { ; CHECK-LABEL: define i32 @test( diff --git a/llvm/test/tools/dsymutil/X86/dwarf5-many-include-directories.test b/llvm/test/tools/dsymutil/X86/dwarf5-many-include-directories.test new file mode 100644 index 00000000000000..644eecd26d8afe --- /dev/null +++ b/llvm/test/tools/dsymutil/X86/dwarf5-many-include-directories.test @@ -0,0 +1,213 @@ +# RUN: rm -rf %t && mkdir -p %t +# RUN: split-file %s %t +# RUN: %python %t/all.py > %t/all.ll +# RUN: sed 's@---TEMPORARY_DIR---@%{/t:regex_replacement}@' %t/debug.map.template > %t/debug.map +# RUN: %llc_dwarf -mtriple x86_64-apple-macosx10.4.0 -o %t/all.o -filetype=obj %t/all.ll +# RUN: dsymutil -f -y %t/debug.map -o - | llvm-dwarfdump -debug-line - | FileCheck %s +# RUN: dsymutil --linker parallel -f -y %t/debug.map -o - | llvm-dwarfdump -debug-line - | tee %t/output.txt | FileCheck %s + +# CHECK: include_directories[255] = "/tmp/tmp.0HPkdttdoU/d254" +# CHECK-NEXT: include_directories[256] = "/tmp/tmp.0HPkdttdoU/d255" +# CHECK-NEXT: include_directories[257] = "/tmp/tmp.0HPkdttdoU/d256" + +# CHECK: dir_index: 255 +# CHECK: dir_index: 256 +# CHECK: dir_index: 257 + +# Original file generated doing the following (fish shell): +# - for cnt in (seq 0 256); mkdir -p d$cnt ; printf "void func$cnd() {}\n#define FUNC$cnt func$cnt()\n" >> d$cnt/f$cnt.c ; end +# - for cnt in (seq 0 256); printf "#include \"f$cnt.c\"" >> all.c ; end +# - printf "void all() {\n" >> all.c +# - for cnt in (seq 0 256); printf "FUNC$cnt;\n" >> all.c ; end +# - printf "}\n" >> all.c +# - clang -target x86_64-apple-macos -S -emit-llvm -gdwarf-5 -o all.ll all.c (for cnt in (seq 0 256); echo "-Id$cnt"; end) +# - Edit all.ll manually and change all DIFile so the directory in filename is +# moved into the directory field. 
+# - Transformed into Python manually. + +#--- all.py +import math +import string + +PROLOGUE = string.Template("""\ +; ModuleID = 'all.c' +source_filename = "all.c" +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.4.0" +""") + +FUNCTION = string.Template("""\ +; Function Attrs: noinline nounwind optnone uwtable +define void @func$idx() #0 !dbg !$dbg_reference_subprogram { + ret void, !dbg !$dbg_reference_location_ret +} +""") + +ALL_FUNCTION_PROLOGUE = string.Template("""\ +; Function Attrs: noinline nounwind optnone uwtable +define void @all() #0 !dbg !$dbg_reference_subprogram { +""") + +ALL_FUNCTION_CALL = string.Template("""\ + call void @func$idx(), !dbg !$dbg_reference_location_call +""") + +ALL_FUNCTION_EPILOGUE = string.Template("""\ + ret void, !dbg !$dbg_reference_location_ret +} +""") + +DWARF_PROLOGUE = string.Template("""\ +attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cmov,+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+ssse3,+x87" "tune-cpu"="generic" } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6, !7} +!llvm.ident = !{!8} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 18.1.6 (CentOS 18.1.6-3.el9)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: Apple, sysroot: "/") +!1 = !DIFile(filename: "all.c", directory: "/tmp/tmp.0HPkdttdoU", checksumkind: CSK_MD5, checksum: "8b5068f097f0c272ddc808ed2d82cb12") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 8, !"PIC Level", i32 2} +!6 = !{i32 7, !"uwtable", i32 2} +!7 = !{i32 7, !"frame-pointer", i32 2} +!8 = !{!"clang version 18.1.6 (CentOS 18.1.6-3.el9)"} +""") + +DWARF_FUNCTION_WITH_TYPE = string.Template("""\ +!$dbg_reference_subprogram = distinct !DISubprogram(name: "func$idx", scope: !$dbg_reference_file, file: !$dbg_reference_file, line: 1, type: !11, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !0) +!$dbg_reference_file = !DIFile(filename: "f$idx.c", directory: "/tmp/tmp.0HPkdttdoU/d$idx", checksumkind: CSK_MD5, checksum: "01234567890123456789012345678901") +!11 = !DISubroutineType(types: !12) +!12 = !{null} +!$dbg_reference_location = !DILocation(line: 1, column: $column, scope: !$dbg_reference_subprogram) +""") + +DWARF_FUNCTION = string.Template("""\ +!$dbg_reference_subprogram = distinct !DISubprogram(name: "func$idx", scope: !$dbg_reference_file, file: !$dbg_reference_file, line: 1, type: !11, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !0) +!$dbg_reference_file = !DIFile(filename: "f$idx.c", directory: "/tmp/tmp.0HPkdttdoU/d$idx", checksumkind: CSK_MD5, checksum: "01234567890123456789012345678901") +!$dbg_reference_location = !DILocation(line: 1, column: $column, scope: !$dbg_reference_subprogram) +""") + +DWARF_ALL_FUNCTION_PROLOGUE = string.Template("""\ +!$dbg_reference_subprogram = distinct !DISubprogram(name: "all", scope: !1, file: !1, line: $line_number, type: !11, scopeLine: $line_number, spFlags: DISPFlagDefinition, unit: !0) +""") + +DWARF_ALL_FUNCTION_LOCATION = string.Template("""\ +!$dbg_reference_location = !DILocation(line: $line_number, column: 1, scope: !$dbg_reference_subprogram) +""") + +NUM_FUNCS = 257 + +dbg_reference_subprogram = 9 
+dbg_reference_file = 10 +dbg_reference_location = 13 +column_base = 15 +functions = [] +dwarf_subprograms = [] + +first = True +for idx in range(NUM_FUNCS): + functions.append( + FUNCTION.substitute( + idx=idx, + dbg_reference_subprogram=dbg_reference_subprogram, + dbg_reference_location_ret=dbg_reference_location, + ) + ) + if first: + dwarf_subprograms.append( + DWARF_FUNCTION_WITH_TYPE.substitute( + idx=idx, + dbg_reference_subprogram=dbg_reference_subprogram, + dbg_reference_file=dbg_reference_file, + dbg_reference_location=dbg_reference_location, + column=column_base, + ) + ) + else: + dwarf_subprograms.append( + DWARF_FUNCTION.substitute( + idx=idx, + dbg_reference_subprogram=dbg_reference_subprogram, + dbg_reference_file=dbg_reference_file, + dbg_reference_location=dbg_reference_location, + column=column_base + math.floor(math.log10(idx)), + ) + ) + + dbg_reference_subprogram += 5 if first else 3 + dbg_reference_file += 5 if first else 3 + dbg_reference_location += 3 + first = False + +dbg_reference_location = dbg_reference_subprogram + 1 +line_number = 258 +all_function = [] +dwarf_all_subprogram = [] + +all_function.append( + ALL_FUNCTION_PROLOGUE.substitute( + dbg_reference_subprogram=dbg_reference_subprogram + ) +) +dwarf_all_subprogram.append( + DWARF_ALL_FUNCTION_PROLOGUE.substitute( + dbg_reference_subprogram=dbg_reference_subprogram, + line_number=line_number + ) +) +line_number += 1 + +for idx in range(NUM_FUNCS): + all_function.append( + ALL_FUNCTION_CALL.substitute( + idx=idx, + dbg_reference_location_call=dbg_reference_location, + ) + ) + dwarf_all_subprogram.append( + DWARF_ALL_FUNCTION_LOCATION.substitute( + dbg_reference_location=dbg_reference_location, + line_number=line_number, + dbg_reference_subprogram=dbg_reference_subprogram, + ) + ) + + dbg_reference_location += 1 + line_number += 1 + +all_function.append( + ALL_FUNCTION_EPILOGUE.substitute( + dbg_reference_location_ret=dbg_reference_location + ) +) +dwarf_all_subprogram.append( + DWARF_ALL_FUNCTION_LOCATION.substitute( + dbg_reference_location=dbg_reference_location, + line_number=line_number, + dbg_reference_subprogram=dbg_reference_subprogram, + ) +) + +print(PROLOGUE.substitute()) +for function in functions: + print(function) +for all_function_piece in all_function: + print(all_function_piece, end='') +print() +print(DWARF_PROLOGUE.substitute(), end='') +for dwarf_subprogram in dwarf_subprograms: + print(dwarf_subprogram, end='') +for dwarf_all_subprogram_piece in dwarf_all_subprogram: + print(dwarf_all_subprogram_piece, end='') +print() + +#--- debug.map.template +--- +triple: 'x86_64-apple-darwin' +objects: + - filename: ---TEMPORARY_DIR---/all.o + symbols: + - { sym: _all, objAddr: 0x0, binAddr: 0x0, size: 0x0 } +... diff --git a/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp b/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp index 0fad4ee4360ddf..485f6c7d33d902 100644 --- a/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp +++ b/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp @@ -48,7 +48,8 @@ static cl::opt OutputFilename("output", cl::value_desc("output"), // Save the bitstream profile from the JSON representation. 
Error convertFromJSON() { - auto BufOrError = MemoryBuffer::getFileOrSTDIN(InputFilename); + auto BufOrError = + MemoryBuffer::getFileOrSTDIN(InputFilename, /*IsText=*/true); if (!BufOrError) return createFileError(InputFilename, BufOrError.getError()); diff --git a/llvm/tools/llvm-reduce/ReducerWorkItem.cpp b/llvm/tools/llvm-reduce/ReducerWorkItem.cpp index 1510e9fb32007e..5409b6dc7459d3 100644 --- a/llvm/tools/llvm-reduce/ReducerWorkItem.cpp +++ b/llvm/tools/llvm-reduce/ReducerWorkItem.cpp @@ -52,6 +52,11 @@ extern cl::OptionCategory LLVMReduceOptions; static cl::opt TargetTriple("mtriple", cl::desc("Set the target triple"), cl::cat(LLVMReduceOptions)); +static cl::opt PrintInvalidMachineReductions( + "print-invalid-reduction-machine-verifier-errors", + cl::desc( + "Print machine verifier errors on invalid reduction attempts"), + cl::cat(LLVMReduceOptions)); static cl::opt TmpFilesAsBitcode( "write-tmp-files-as-bitcode", @@ -417,7 +422,7 @@ static std::unique_ptr cloneMF(MachineFunction *SrcMF, DstMRI->freezeReservedRegs(); - DstMF->verify(nullptr, "", /*AbortOnError=*/true); + DstMF->verify(nullptr, "", &errs(), /*AbortOnError=*/true); return DstMF; } @@ -450,8 +455,21 @@ bool ReducerWorkItem::verify(raw_fd_ostream *OS) const { for (const Function &F : getModule()) { if (const MachineFunction *MF = MMI->getMachineFunction(F)) { - if (!MF->verify(nullptr, "", /*AbortOnError=*/false)) + // With the current state of quality, most reduction attempts fail the + // machine verifier. Avoid spamming large function dumps on nearly every + // attempt until the situation is better. + if (!MF->verify(nullptr, "", + /*OS=*/PrintInvalidMachineReductions ? &errs() : nullptr, + /*AbortOnError=*/false)) { + + if (!PrintInvalidMachineReductions) { + WithColor::warning(errs()) + << "reduction attempt on function '" << MF->getName() + << "' failed machine verifier (debug with " + "-print-invalid-reduction-machine-verifier-errors)\n"; + } return true; + } } } diff --git a/llvm/unittests/MI/LiveIntervalTest.cpp b/llvm/unittests/MI/LiveIntervalTest.cpp index 7dcd82f3e7aa61..f910e8e1f2c8fb 100644 --- a/llvm/unittests/MI/LiveIntervalTest.cpp +++ b/llvm/unittests/MI/LiveIntervalTest.cpp @@ -101,7 +101,9 @@ struct TestPassT : public TestPass { bool runOnMachineFunction(MachineFunction &MF) override { AnalysisType &A = getAnalysis(); T(MF, A); - EXPECT_EQ(MF.verify(this, /* Banner */ nullptr, /* AbortOnError */ false), + EXPECT_EQ(MF.verify(this, /* Banner=*/nullptr, + /*OS=*/nullptr, + /* AbortOnError=*/false), ShouldPass); return true; } diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index 1fcc9cbea152cd..42df09609b675c 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -1769,6 +1769,7 @@ define void @foo(i8 %v1, ptr %ptr) { store volatile i8 %ld0, ptr %ptr %atomicrmw = atomicrmw add ptr %ptr, i8 %v1 acquire %udiv = udiv i8 %ld0, %v1 + %urem = urem i8 %ld0, %v1 call void @foo() ret void } @@ -1861,6 +1862,18 @@ define void @foo(i8 %v1, ptr %ptr) { for (auto &LLVMI : *LLVMBB1) { auto &I = cast(*Ctx.getValue(&LLVMI)); + // Check isTerminator(). + EXPECT_EQ(LLVMI.isTerminator(), I.isTerminator()); + // Check isUnaryOp(). + EXPECT_EQ(LLVMI.isUnaryOp(), I.isUnaryOp()); + // Check isBinaryOp(). + EXPECT_EQ(LLVMI.isBinaryOp(), I.isBinaryOp()); + // Check isIntDivRem(). + EXPECT_EQ(LLVMI.isIntDivRem(), I.isIntDivRem()); + // Check isShift(). + EXPECT_EQ(LLVMI.isShift(), I.isShift()); + // Check isCast().
+ EXPECT_EQ(LLVMI.isCast(), I.isCast()); // Check isAssociative(). EXPECT_EQ(LLVMI.isAssociative(), I.isAssociative()); // Check isCommutative(). diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index f3ca09a6a68ea8..26eec0d4f2082a 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -19,12 +19,18 @@ class XeGPUAttr traits = [], let mnemonic = attrMnemonic; } -def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> { +class XeGPU_TensorDescAttr traits = [], + string baseCppClass = "::mlir::Attribute"> + : XeGPUAttr { + let assemblyFormat = "`<` struct(params) `>`"; +} + +def XeGPU_BlockTensorDescAttr: XeGPU_TensorDescAttr<"BlockTensorDesc", "block_tdesc_attr"> { let summary = [{a composite attribute for `TensorDescType`}]; - let description = [{`TensorDescAttr` (or `tdesc_attr`) is a composite + let description = [{`BlockTensorDesc` (or `block_tdesc_attr`) is a composite attribute defined for `TensorDescType` for describing following properties of a `TensorDesc`. - 1. `memory_scope`: It describes where the data block described by the + 1. `memory_space`: It describes where the data block described by the TensorDesc is located, `Global` device memory or `Shared` local memory. It is default to `Global`. 2. `array_length`: It describes how many horizontally consecutive blocks @@ -33,43 +39,63 @@ def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> { 8x32. Its default value is 1. 3. `boundary_check`: It is used to indicates the hardware whether to do out-of-boundary check. The default value is true. - 4. `scattered`: It is used to differenciate TensorDescs created from - `create_nd_tdesc` vs from `create_tdesc`. }]; let parameters = (ins - OptionalParameter<"MemoryScopeAttr">: $memory_scope, + OptionalParameter<"MemorySpaceAttr">: $memory_space, OptionalParameter<"IntegerAttr", "1">: $array_length, - OptionalParameter<"BoolAttr", "true">: $boundary_check, - OptionalParameter<"BoolAttr", "false">: $scattered + OptionalParameter<"BoolAttr", "true">: $boundary_check ); let builders = [ AttrBuilder<(ins - CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope, + CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space, CArg<"int", "1">:$array_length, - CArg<"bool", "true">: $boundary_check, - CArg<"bool", "false">: $scattered + CArg<"bool", "true">: $boundary_check )> ]; - let assemblyFormat = "`<` struct(params) `>`"; } +def XeGPU_ScatterTensorDescAttr: XeGPU_TensorDescAttr<"ScatterTensorDesc", "scatter_tdesc_attr"> { + let summary = [{a composite attribute for `TensorDescType`}]; + let description = [{`ScatterTensorDesc` (or `scatter_tdesc_attr`) is a composite + attribute defined for `TensorDescType` for describing following + properties of a `TensorDesc`. + 1. `memory_space`: It describes where the data block described by the + TensorDesc is located, `Global` device memory or `Shared` local memory. + It is default to `Global`. + 2. `chunk_size`: indicates the number of contiguous elements accessed for each + offset, default is 1. It is used with `scattered` attr only.
+ }]; + + let parameters = (ins + OptionalParameter<"MemorySpaceAttr">: $memory_space, + OptionalParameter<"IntegerAttr", "1">: $chunk_size + ); + + let builders = [ + AttrBuilder<(ins + CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space, + CArg<"int", "1">: $chunk_size + )> + ]; + } + //===----------------------------------------------------------------------===// // XeGPU Memory Scope Enums. //===----------------------------------------------------------------------===// -def XeGPU_MemoryScopeGlobal: I32EnumAttrCase<"Global", 0, "global">; -def XeGPU_MemoryScopeShared: I32EnumAttrCase<"SLM", 1, "slm">; -def XeGPU_MemoryScope: I32EnumAttr<"MemoryScope", +def XeGPU_MemorySpaceGlobal: I32EnumAttrCase<"Global", 0, "global">; +def XeGPU_MemorySpaceShared: I32EnumAttrCase<"SLM", 3, "slm">; +def XeGPU_MemorySpace: I32EnumAttr<"MemorySpace", "The address space of the memory the tensor descritor is created for", - [XeGPU_MemoryScopeGlobal, XeGPU_MemoryScopeShared]> { + [XeGPU_MemorySpaceGlobal, XeGPU_MemorySpaceShared]> { let genSpecializedAttr = 0; let cppNamespace = "::mlir::xegpu"; } -def XeGPU_MemoryScopeAttr: - EnumAttr { +def XeGPU_MemorySpaceAttr: + EnumAttr { let summary = [{Describe the location of data described by a `TensorDesc`: Global device memory (`Global`) or Shared local memory (`SLM`).}]; let assemblyFormat = "$value"; @@ -116,4 +142,4 @@ def XeGPU_FenceScopeAttr: let assemblyFormat = "$value"; } -#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD \ No newline at end of file +#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index c32c7541c39791..e24a056de2caf3 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -218,6 +218,23 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; } mlir::Value getViewSource() { return getSource(); } + + unsigned getSourceMemorySpace() { + auto srcTy = getSourceType(); + if (auto memrefTy = llvm::dyn_cast(srcTy)) { + auto attr = memrefTy.getMemorySpace(); + if (attr) { + if (auto intAttr = llvm::dyn_cast(attr)) { + return static_cast(intAttr.getInt()); + } + if (auto memSpaceAttr = llvm::dyn_cast(attr)) + return static_cast(memSpaceAttr.getValue()); + } + } + // take global as default memory scope. + return static_cast(MemorySpace::Global); + } + }]; } @@ -411,8 +428,10 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> { is fixed to the hardware supportted subgroup size, e.g., 16 on PVC, implying each element in the array corresponds to a work-item (SIMT lane) in the subgroup. - * chunk_size: [optional attribute] indicates number of continious - elements accessed for each offset, default is 1. + + The first dimension of the result TensorDesc corresponds to work-items, so it should + match the dimension of offsets. It may also have a second dimension corresponding to + the chunk_size if the chunk size is larger than 1. Example 1.
It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64] ```mlir @@ -424,29 +443,22 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> { It will access totally 32 data elements: a[0:7], a[16:23], a[32:39], a[64:71] ```mlir %0 = memref.alloc() : memref<1024xf32> - %1 = xegpu.create_tdesc %0[0, 16, 32, 64] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32> + %1 = xegpu.create_tdesc %0[0, 16, 32, 64] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size = 8> ``` Example 3. It is similar to Example 2, but there is some overlaps among workitems. It accesses: a[0:7], a[4:11], a[8:15], a[12:19] ```mlir %0 = memref.alloc() : memref<1024xf32> - %1 = xegpu.create_tdesc %0[0, 4, 8, 12] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32> + %1 = xegpu.create_tdesc %0[0, 4, 8, 12] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size = 8> ``` }]; let arguments = (ins XeGPU_BaseAddrType: $source, Variadic: $offsets, - DenseI64ArrayAttr: $const_offsets, - DefaultValuedAttr: $chunk_size); + DenseI64ArrayAttr: $const_offsets); let results = (outs XeGPU_TensorDesc:$TensorDesc); - let builders = [ - OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "Value": $source, - "llvm::ArrayRef": $offsets, - CArg<"uint32_t", "1"> : $chunk_size)>, - ]; - let assemblyFormat = [{ $source custom($offsets, $const_offsets) @@ -473,6 +485,22 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> { assert(idx < getNumOffsets() && "Invalid out of bound access."); return getMixedOffsets()[idx]; } + + unsigned getSourceMemorySpace() { + auto srcTy = getSource().getType(); + if (auto memrefTy = llvm::dyn_cast(srcTy)) { + auto attr = memrefTy.getMemorySpace(); + if (attr) { + if (auto intAttr = llvm::dyn_cast(attr)) + return static_cast(intAttr.getInt()); + if (auto memSpaceAttr = llvm::dyn_cast(attr)) + return static_cast(memSpaceAttr.getValue()); + } + } + // take global as default memory scope. + return static_cast(MemorySpace::Global); + } + }]; let hasVerifier = 1; @@ -520,28 +548,31 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"] let description = [{ It (aka. load) load data per each work-item. The output describes the data being loaded at the subgroup level, so its size is - consistent with the number of work-items in a subgroup. When `chunk_size_per_lane` - attribute is larger than 1 in TensorDesc, the output vector will be 2D vector, - with dim-1 correspoding to the chunk size. + consistent with the number of work-items in a subgroup. When the chunk size + is larger than 2, the output vector is a 2D vector, with dim-1 corresponding + to work-items, and dim-0 corresponding to the chunk_size loaded by each work-item. + Specifically, there is a transpose effect on the result (as compared to the TensorDesc) + due to the hardware implementation. Therefore, a transpose attribute is introduced + on purpose, making sure users are aware of this implicit transformation. The mask operand masks out memory access so that it is safe to pass out-of-boundary addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
Example: ```mlir - %2 = xegpu.load %1, %0 {transpose = [1, 0], + %2 = xegpu.load %1, %0 {transpose, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} - : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> - -> vector<16xf32> + : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, + vector<16xi1> -> vector<16xf32> ``` }]; let arguments = (ins XeGPU_TensorDesc: $TensorDesc, XeGPU_MaskType: $mask, - OptionalAttr: $transpose, + OptionalAttr: $transpose, OptionalAttr: $l1_hint, OptionalAttr: $l2_hint, OptionalAttr: $l3_hint); @@ -573,11 +604,15 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"] let hasVerifier = 1; } -def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllShapesMatch<["value", "TensorDesc"]>, - AllElementTypesMatch<["value", "TensorDesc"]>]> { +def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllElementCountsMatch<["value", "TensorDesc"]>, + AllElementTypesMatch<["value", "TensorDesc"]>]> { let summary = "store data to scattered memory locations."; - let description = [{ It (aka. store) stores data to scattered memory locations. - It has similar semantic to `load_gather`. + let description = [{ It (aka. store) stores data to scattered memory locations. The value is + typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be + a 2D vector instead. For the later case, dim-1 of the value correspods to the simd lanes + and the dim-0 of the value corresponds to the chunk_size stored per lane. So `store_scatter` + has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is + introduced on purpose, making sure users are aware of this implicit transformation. Example: ```mlir @@ -592,6 +627,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllShapesMatch<["value", "TensorDe XeGPU_ValueType: $value, XeGPU_TensorDesc: $TensorDesc, XeGPU_MaskType: $mask, + OptionalAttr: $transpose, OptionalAttr: $l1_hint, OptionalAttr: $l2_hint, OptionalAttr: $l3_hint); @@ -723,7 +759,7 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>] def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure, AllElementTypesMatch<["tensorDesc", "value", "result"]>, - AllShapesMatch<["tensorDesc", "mask", "value", "result"]>]> { + AllShapesMatch<["tensorDesc", "value", "result"]>]> { let summary = "Atomic ready-modify-write operation on the TensorDesc. "; let description = [{ @@ -808,7 +844,7 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> { 2. `Fence_scope` describes the scope of fence. "Workgroup" means that the scope would be within each workgroup. "GPU" means the scope would be across workgroups within the GPU. 
}]; - let arguments = (ins XeGPU_MemoryScopeAttr: $memory_kind, + let arguments = (ins XeGPU_MemorySpaceAttr: $memory_kind, XeGPU_FenceScopeAttr: $fence_scope); let assemblyFormat = [{`memory_kind` `=` `` $memory_kind `,` `fence_scope` `=` `` $fence_scope attr-dict}]; let extraClassDeclaration = extraBaseClassDeclaration; diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td index 9f101a71697b56..0ce1211664b5ba 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td @@ -48,7 +48,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", Similar to the builtin tensor, it also provides an optinal attribute to encoding the following information via the TensorDescAttr object: - * memory_scope (xegpu::MemoryScope): [optional] where the data is located, + * memory_space (xegpu::MemorySpace): [optional] where the data is located, global memory or shared memory. It is default to Global. * array_length (int): [optional] The number of contiguous blocks with size as `shape`, that will be loaded by block load at a time. It is default to 1. @@ -63,7 +63,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", element-type ::= float-type | integer-type | index-type dim-list := (static-dim-list `x`)? static-dim-list ::= decimal-literal `x` decimal-literal - attr-list = (, memory_scope = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)? + attr-list = (, memory_space = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)? ``` Examples: @@ -76,7 +76,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", xegpu.tensor_desc<8x16xf32> // A TensorDesc with 8x16 f32 elements for a memory region in shared memory space. 
- xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> ``` }]; @@ -88,11 +88,14 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", TypeBuilderWithInferredContext<(ins "llvm::ArrayRef": $shape, "mlir::Type": $elementType, - CArg<"bool", "false">: $scattered, CArg<"int", "1">: $array_length, - CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope, - CArg<"bool", "true">: $boundary_check - )> + CArg<"bool", "true">: $boundary_check, + CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space)>, + TypeBuilderWithInferredContext<(ins + "llvm::ArrayRef": $shape, + "mlir::Type": $elementType, + CArg<"int", "1">: $chunk_size, + CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space)> ]; let extraClassDeclaration = [{ @@ -110,40 +113,58 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", return llvm::cast(cloneWith(getShape(), elementType)); } - TensorDescAttr getEncodingAsTensorDescAttr() const { - return llvm::dyn_cast_if_present(getEncoding()); + BlockTensorDescAttr getEncodingAsBlockTensorDescAttr() const { + return llvm::dyn_cast_if_present(getEncoding()); } - xegpu::MemoryScope getMemoryScope() const { - auto attr = getEncodingAsTensorDescAttr(); - if (attr && attr.getMemoryScope()) - return attr.getMemoryScope().getValue(); + ScatterTensorDescAttr getEncodingAsScatterTensorDescAttr() const { + return llvm::dyn_cast_if_present(getEncoding()); + } + + xegpu::MemorySpace getMemorySpace() const { + auto block_attr = getEncodingAsBlockTensorDescAttr(); + if (block_attr && block_attr.getMemorySpace()) + return block_attr.getMemorySpace().getValue(); + + auto scatter_attr = getEncodingAsScatterTensorDescAttr(); + if (scatter_attr && scatter_attr.getMemorySpace()) + return scatter_attr.getMemorySpace().getValue(); + // return default value - return MemoryScope::Global; + return MemorySpace::Global; } int getArrayLength() { - auto attr = getEncodingAsTensorDescAttr(); - if (attr && attr.getArrayLength()) - return attr.getArrayLength().getInt(); + auto attr = getEncoding(); + auto block_attr = mlir::dyn_cast_if_present(attr); + assert((!attr || block_attr) && "invalid on non BlockTensorDescAttr."); + if (block_attr && block_attr.getArrayLength()) + return block_attr.getArrayLength().getInt(); // return default value return 1; } bool getBoundaryCheck() { - auto attr = getEncodingAsTensorDescAttr(); - if (attr && attr.getBoundaryCheck()) - return attr.getBoundaryCheck().getValue(); + auto attr = getEncoding(); + auto block_attr = mlir::dyn_cast_if_present(attr); + assert((!attr || block_attr) && "invalid on non BlockTensorDescAttr."); + if (block_attr && block_attr.getBoundaryCheck()) + return block_attr.getBoundaryCheck().getValue(); // return default value return true; } - bool getScattered() { - auto attr = getEncodingAsTensorDescAttr(); - if (attr && attr.getScattered()) - return attr.getScattered().getValue(); - // return default value - return false; + bool isScattered() { + return bool(getEncodingAsScatterTensorDescAttr()); + } + + int getChunkSize() { + auto attr = getEncoding(); + auto scatter_attr = mlir::dyn_cast_if_present(attr); + assert((!attr || scatter_attr) && "invalid on non ScatterTensorDescAttr."); + if (scatter_attr && scatter_attr.getChunkSize()) + return scatter_attr.getChunkSize().getInt(); + return 1; } }]; diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp index be1581d619a8b1..fa034427655394 
100644 --- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp +++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp @@ -168,9 +168,8 @@ struct TransferReadLowering : public OpRewritePattern { if (isTransposeLoad) std::reverse(descShape.begin(), descShape.end()); auto descType = xegpu::TensorDescType::get( - descShape, elementType, /*scattered=*/false, /*array_length=*/1, - xegpu::MemoryScope::Global, - /*boundary_check=*/isOutOfBounds); + descShape, elementType, /*array_length=*/1, + /*boundary_check=*/isOutOfBounds, xegpu::MemorySpace::Global); xegpu::CreateNdDescOp ndDesc = createNdDescriptor(rewriter, loc, descType, @@ -212,10 +211,10 @@ struct TransferWriteLowering return rewriter.notifyMatchFailure(writeOp, "Expects identity map"); VectorType vecTy = writeOp.getVectorType(); - auto descType = xegpu::TensorDescType::get( - vecTy.getShape(), vecTy.getElementType(), - /*scattered=*/false, /*array_length=*/1, xegpu::MemoryScope::Global, - /*boundary_check=*/false); + auto descType = + xegpu::TensorDescType::get(vecTy.getShape(), vecTy.getElementType(), + /*array_length=*/1, /*boundary_check=*/false, + xegpu::MemorySpace::Global); xegpu::CreateNdDescOp ndDesc = createNdDescriptor( rewriter, loc, descType, dyn_cast>(writeOp.getSource()), diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 6800a0fec278c6..fa20001f661822 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -810,27 +810,35 @@ static Value calculateGatherOffset(RewriterBase &rewriter, enum VectorMemoryAccessKind { ScalarBroadcast, Contiguous, Gather }; -/// Find the non-unit dim in a linalgOp. -/// When executing this hook, it is expected that only one dim will be non-unit. -/// Other cases (i.e. reading n-D vectors) should've been labelled as gather -/// loads before calling this method. This is used for finding contiguous loads -/// (represented as `tensor.extract`) within `linalg.generic` Ops. Note that -/// this condition is expected to hold for statically shaped Linalg Ops only. -static uint64_t getNonUnitLoopDim(LinalgOp linalgOp) { - uint64_t nonUnitDim = 0; - uint64_t countNonUnitDim = 0; - for (auto tripCount : llvm::enumerate(linalgOp.getStaticLoopRanges())) { - if (tripCount.value() != 1) { - nonUnitDim = tripCount.index(); - countNonUnitDim++; - } - } - +/// Find the index of the trailing non-unit dim in linalgOp. This hook is used +/// when checking whether `tensor.extract` Op (within a `linalg.generic` Op) +/// represents a contiguous load operation. +/// +/// Note that when calling this hook, it is assumed that the output vector is +/// effectively 1D. Other cases (i.e. reading n-D vectors) should've been +/// labelled as a gather load before entering this method. +/// +/// Following on from the above, it is assumed that: +/// * for statically shaped loops, when no masks are used, only one dim is != +/// 1 (that's what the shape of the output vector is based on). +/// * for dynamically shaped loops, there might be more non-unit dims +/// as the output vector type is user-specified. 
+/// +/// TODO: Statically shaped loops + vector masking +static uint64_t getTrailingNonUnitLoopDimIdx(LinalgOp linalgOp) { + SmallVector loopRanges = linalgOp.getStaticLoopRanges(); assert(linalgOp.hasDynamicShape() || - countNonUnitDim == 1 && "For statically shaped Linalg Ops, only one " - "non-unit loop dim is expected"); - (void)countNonUnitDim; - return nonUnitDim; + llvm::count_if(loopRanges, [](int64_t dim) { return dim != 1; }) == + 1 && + "For statically shaped Linalg Ops, only one " + "non-unit loop dim is expected"); + + size_t idx = loopRanges.size() - 1; + for (; idx >= 0; idx--) + if (loopRanges[idx] != 1) + break; + + return idx; } /// Checks whether `val` can be used for calculating a loop invariant index. @@ -854,11 +862,11 @@ static bool isLoopInvariantIdx(LinalgOp &linalgOp, Value &val, assert(defOp && "This is neither a block argument nor an operation result"); // IndexOp is loop invariant as long as its result remains constant across - // iterations. Given the assumptions on the loop ranges above, only the - // trailing loop dim ever changes. - auto trailingLoopDim = linalgOp.getStaticLoopRanges().size() - 1; - if (auto indexOp = dyn_cast(defOp)) - return (indexOp.getDim() != trailingLoopDim); + // iterations. Note that for dynamic shapes, the corresponding dim will also + // be conservatively treated as != 1. + if (auto indexOp = dyn_cast(defOp)) { + return linalgOp.getStaticLoopRanges()[indexOp.getDim()] == 1; + } auto *ancestor = block->findAncestorOpInBlock(*defOp); @@ -877,7 +885,7 @@ static bool isLoopInvariantIdx(LinalgOp &linalgOp, Value &val, return result; } -/// Check whether \p val could be used for calculating the trailing index for a +/// Check whether `val` could be used for calculating the trailing index for a /// contiguous load operation. /// /// There are currently 3 types of values that are allowed here: @@ -886,13 +894,14 @@ static bool isLoopInvariantIdx(LinalgOp &linalgOp, Value &val, /// 3. results of basic arithmetic operations (linear and continuous) /// involving 1., 2. and 3. /// This method returns True if indeed only such values are used in calculating -/// \p val. +/// `val.` /// /// Additionally, the trailing index for a contiguous load operation should /// increment by 1 with every loop iteration, i.e. be based on: /// * `linalg.index ` , -/// where is the trailing dim of the iteration space. \p foundIndexOp is -/// updated to `true` when such an op is found. +/// where is the trailing non-unit dim of the iteration space (this way, +/// `linalg.index ` increments by 1 with every loop iteration). +/// `foundIndexOp` is updated to `true` when such Op is found. static bool isContiguousLoadIdx(LinalgOp &linalgOp, Value &val, bool &foundIndexOp, VectorType resType) { @@ -912,12 +921,10 @@ static bool isContiguousLoadIdx(LinalgOp &linalgOp, Value &val, Operation *defOp = val.getDefiningOp(); assert(defOp && "This is neither a block argument nor an operation result"); - // Given the assumption on the loop ranges above, we expect only 1 non-unit - // loop dim. 
- auto nonUnitLoopDim = getNonUnitLoopDim(linalgOp); - if (auto indexOp = dyn_cast(defOp)) { - foundIndexOp = (indexOp.getDim() == nonUnitLoopDim); + auto loopDimThatIncrementsByOne = getTrailingNonUnitLoopDimIdx(linalgOp); + + foundIndexOp = (indexOp.getDim() == loopDimThatIncrementsByOne); return true; } @@ -1012,7 +1019,10 @@ getTensorExtractMemoryAccessPattern(tensor::ExtractOp extractOp, bool foundIndexOp = false; bool isContiguousLoad = isContiguousLoadIdx(linalgOp, extractOpTrailingIdx, foundIndexOp, resType); - isContiguousLoad &= foundIndexOp; + // TODO: Support generating contiguous loads for column vectors - that will + // require adding a permutation map to tranfer_read Ops. + bool isRowVector = resType.getShape().back() != 1; + isContiguousLoad &= (foundIndexOp && isRowVector); if (isContiguousLoad) { LDBG("Found contigous load: " << extractOp); @@ -1073,6 +1083,11 @@ vectorizeTensorExtract(RewriterBase &rewriter, VectorizationState &state, // b. contiguous loads. // Both cases use vector.transfer_read. + assert(llvm::count_if(resultType.getShape(), + [](uint64_t dim) { return dim != 1; }) && + "Contiguous loads and scalar loads + broadcast only support 1-D " + "vectors ATM!"); + // Collect indices for `vector.transfer_read`. At this point, the indices will // either be scalars or would have been broadcast to vectors matching the // result type. For indices that are vectors, there are two options: @@ -2972,10 +2987,15 @@ struct Conv1DGenerator if (!setOperKind(reduceOp)) return; auto maybeKind = getCombinerOpKind(reduceOp); - if (!maybeKind || (*maybeKind != vector::CombiningKind::ADD && + // Typically convolution will have a `Add` CombiningKind but for i1 type it + // can get strength reduced to `OR` which is also supported. This strength + // reduction logic is in `buildBinaryFn` helper in the Linalg dialect. + if (!maybeKind || ((*maybeKind != vector::CombiningKind::ADD && + *maybeKind != vector::CombiningKind::OR) && (oper != Pool || !isSupportedPoolKind(*maybeKind)))) { return; } + reductionKind = maybeKind.value(); auto rhsRank = rhsShapedType.getRank(); switch (oper) { @@ -3258,10 +3278,12 @@ struct Conv1DGenerator bindDims(ctx, n, w, f, c); lhs = promote(rewriter, loc, lhs, res.getType()); rhs = promote(rewriter, loc, rhs, res.getType()); - return rewriter.create( + auto contrationOp = rewriter.create( loc, lhs, rhs, res, /*indexingMaps=*/MapList{{n, w, c}, {c, f}, {n, w, f}}, /*iteratorTypes=*/ArrayRef{par, par, par, red}); + contrationOp.setKind(reductionKind); + return contrationOp; } // Create an outerproduct: lhs{w} * rhs{1} -> res{w} for single channel @@ -3651,6 +3673,7 @@ struct Conv1DGenerator int strideW, dilationW; Value lhsShaped, rhsShaped, resShaped; ShapedType lhsShapedType, rhsShapedType, resShapedType; + vector::CombiningKind reductionKind; // Sets oper, poolExtOp and isPoolExt for valid conv/pooling ops. // Returns true iff it is a valid conv/pooling op. @@ -3666,7 +3689,9 @@ struct Conv1DGenerator switch (numBlockArguments) { case 1: { // Will be convolution if feeder is a MulOp. - // Otherwise, if it can be pooling. + // A strength reduced version of MulOp for i1 type is AndOp which is also + // supported. Otherwise, it can be pooling. This strength reduction logic + // is in `buildBinaryFn` helper in the Linalg dialect. 
auto feedValIt = llvm::find_if_not(reduceOp->getOperands(), llvm::IsaPred); Operation *feedOp = (*feedValIt).getDefiningOp(); @@ -3674,7 +3699,9 @@ struct Conv1DGenerator oper = Pool; isPoolExt = true; poolExtOp = feedOp->getName().getIdentifier(); - } else if (!(isa(feedOp) && + } else if (!((isa(feedOp) || + (isa(feedOp) && + feedOp->getResultTypes()[0].isInteger(1))) && llvm::all_of(feedOp->getOperands(), [](Value v) { if (isa(v)) return true; diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 24719fe748fe4f..1dfbaed454c193 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -30,23 +30,35 @@ void XeGPUDialect::initialize() { } //===----------------------------------------------------------------------===// -// XeGPU_TensorDescAttr +// XeGPU_BlockTensorDescAttr //===----------------------------------------------------------------------===// -TensorDescAttr TensorDescAttr::get(mlir::MLIRContext *context, - xegpu::MemoryScope memory_scope, - int array_length, bool boundary_check, - bool scattered) { - auto scopeAttr = MemoryScopeAttr::get(context, memory_scope); +BlockTensorDescAttr BlockTensorDescAttr::get(mlir::MLIRContext *context, + xegpu::MemorySpace memory_space, + int array_length, + bool boundary_check) { + auto scopeAttr = MemorySpaceAttr::get(context, memory_space); auto lengthAttr = IntegerAttr::get(IntegerType::get(context, 64), array_length); auto boundaryAttr = BoolAttr::get(context, boundary_check); - auto scatteredAttr = BoolAttr::get(context, scattered); - return Base::get(context, scopeAttr, lengthAttr, boundaryAttr, scatteredAttr); + return Base::get(context, scopeAttr, lengthAttr, boundaryAttr); +} + +//===----------------------------------------------------------------------===// +// XeGPU_ScatterTensorDescAttr +//===----------------------------------------------------------------------===// +ScatterTensorDescAttr +ScatterTensorDescAttr::get(mlir::MLIRContext *context, + xegpu::MemorySpace memory_space, int chunk_size) { + auto scopeAttr = MemorySpaceAttr::get(context, memory_space); + auto chunkSizeAttr = + IntegerAttr::get(IntegerType::get(context, 64), chunk_size); + return Base::get(context, scopeAttr, chunkSizeAttr); } //===----------------------------------------------------------------------===// // XeGPU_TensorDescType //===----------------------------------------------------------------------===// + mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) { llvm::SmallVector shape; mlir::Type elementType; @@ -108,12 +120,20 @@ void TensorDescType::print(::mlir::AsmPrinter &printer) const { } TensorDescType TensorDescType::get(llvm::ArrayRef shape, - mlir::Type elementType, bool scattered, - int array_length, MemoryScope memory_scope, - bool boundary_check) { + mlir::Type elementType, int array_length, + bool boundary_check, + MemorySpace memory_space) { + auto context = elementType.getContext(); + auto attr = BlockTensorDescAttr::get(context, memory_space, array_length, + boundary_check); + return Base::get(context, shape, elementType, attr); +} + +TensorDescType TensorDescType::get(llvm::ArrayRef shape, + mlir::Type elementType, int chunk_size, + MemorySpace memory_space) { auto context = elementType.getContext(); - auto attr = TensorDescAttr::get(context, memory_scope, array_length, - boundary_check, scattered); + auto attr = ScatterTensorDescAttr::get(context, memory_space, chunk_size); return Base::get(context, shape, elementType, attr); } 
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 9c517337a3aa57..1a7a6b34784099 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -124,6 +124,17 @@ LogicalResult CreateNdDescOp::verify() { bool invalidRank = false; bool invalidElemTy = false; + // Memory space of created TensorDesc should match with the source. + // Both source and TensorDesc are considered for global memory by default, + // if the memory scope attr is not specified. If source is an integer, + // it is considered as ptr to global memory. + auto srcMemorySpace = getSourceMemorySpace(); + auto tdescMemorySpace = static_cast(getType().getMemorySpace()); + if (srcMemorySpace != tdescMemorySpace) + return emitOpError("Memory space mismatch.") + << " Source: " << srcMemorySpace + << ", TensorDesc: " << tdescMemorySpace; + // check source type matches the rank if it is a memref. // It also should have the same ElementType as TensorDesc. auto memrefTy = dyn_cast(getSourceType()); @@ -152,9 +163,13 @@ LogicalResult CreateNdDescOp::verify() { return emitOpError("TensorDesc should have the same element " "type with the source if it is a memref.\n"); - if (getType().getScattered()) + if (getType().isScattered()) return emitOpError("Expects a non-scattered TensorDesc.\n"); + if (getType().getRank() == 2 && + tdescMemorySpace == static_cast(MemorySpace::SLM)) + return emitOpError("SLM is not supported for 2D Block TensorDesc.\n"); + return success(); } @@ -163,7 +178,7 @@ LogicalResult CreateNdDescOp::verify() { //===----------------------------------------------------------------------===// LogicalResult PrefetchNdOp::verify() { auto tdescTy = getTensorDescType(); - if (tdescTy.getScattered()) + if (tdescTy.isScattered()) return emitOpError("Expects a non-scattered TensorDesc.\n"); if (!isReadHintOrNone(getL1HintAttr())) @@ -188,7 +203,7 @@ LogicalResult LoadNdOp::verify() { if (tdescTy.getRank() > 2) return emitOpError("Expecting a 1D/2D TensorDesc.\n"); - if (tdescTy.getScattered()) + if (tdescTy.isScattered()) return emitOpError("Expects a non-scattered TensorDesc.\n"); if (!valueTy) @@ -228,8 +243,8 @@ LogicalResult LoadNdOp::verify() { tdescShape[axis] /= vnni_factor; tdescShape.push_back(vnni_factor); } else { - return emitWarning("Invalid Packed Attr. It is ignored (available for 2D " - "TensorDesc only)."); + emitWarning("Invalid Packed Attr. 
It is ignored (available for 2D " + "TensorDesc only)."); } } @@ -256,7 +271,7 @@ LogicalResult StoreNdOp::verify() { if (dstTy.getRank() > 2) return emitOpError("Expecting a 1D/2D TensorDesc.\n"); - if (dstTy.getScattered()) + if (dstTy.isScattered()) return emitOpError("Expects a non-scattered TensorDesc.\n"); if (!valTy) @@ -279,7 +294,7 @@ LogicalResult StoreNdOp::verify() { //===----------------------------------------------------------------------===// LogicalResult UpdateNdOffsetOp::verify() { auto ty = getTensorDescType(); - if (ty.getScattered()) + if (ty.isScattered()) return emitOpError("Expects a non-scattered TensorDesc.\n"); // number of offsets specified must match the rank of the tensor descriptor @@ -292,28 +307,55 @@ LogicalResult UpdateNdOffsetOp::verify() { //===----------------------------------------------------------------------===// // XeGPU_CreateDescOp //===----------------------------------------------------------------------===// -void CreateDescOp::build(OpBuilder &builder, OperationState &state, - TensorDescType TensorDesc, Value source, - llvm::ArrayRef offsets, - uint32_t chunk_size) { - llvm::SmallVector staticOffsets; - llvm::SmallVector dynamicOffsets; - dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); - build(builder, state, TensorDesc, source, dynamicOffsets, staticOffsets, - chunk_size); -} LogicalResult CreateDescOp::verify() { auto tdescTy = getTensorDescType(); - auto chunkSize = getChunkSize(); if (getRankOf(getSource()) > 1) return emitOpError( "Expecting the source is a 1D memref or pointer (uint64_t)."); - if (!tdescTy.getScattered()) + if (!tdescTy.isScattered()) return emitOpError("Expects a scattered TensorDesc.\n"); + // Memory space of created TensorDesc should match with the source. + // Both source and TensorDesc are considered for global memory by default, + // if the memory scope attr is not specified. If source is an integer, + // it is considered as ptr to global memory. + auto srcMemorySpace = getSourceMemorySpace(); + auto tdescMemorySpace = static_cast(tdescTy.getMemorySpace()); + if (srcMemorySpace != tdescMemorySpace) + return emitOpError("Memory space mismatch.") + << " Source: " << srcMemorySpace + << ", TensorDesc: " << tdescMemorySpace; + + auto chunkSize = tdescTy.getChunkSize(); + + // check chunk_size + llvm::SmallVector supportedChunkSizes = {1, 2, 3, 4, 8, + 16, 32, 64, 128, 256}; + if (!llvm::is_contained(supportedChunkSizes, chunkSize)) + return emitOpError("Invalid chunk_size. Supported values are 1, 2, 3, 4, " + "8, 16, 32, 64, 128, or 256."); + + // check total size + auto elemBits = tdescTy.getElementType().getIntOrFloatBitWidth(); + auto bitsPerLane = elemBits * chunkSize; + if (chunkSize > 1 && bitsPerLane % 32) { + // For 8-bit and 16-bit data, the hardware only supports chunk size of 1. + // For 32-bit data, the hardware can support larger larger chunk size. So + // we can bitcast 8-bit/16-bit data to 32-bit data for better performance. + // But this requires the total size is 32 bit aligned to make the + // optimization work. + return emitOpError( + "access size (chunk_size * sizeof(elemTy)) should be 32-bit aligned."); + } + + auto lscConstraints = 512 * 8; // each access is upto 512 bytes. 
+ if (elemBits * tdescTy.getNumElements() > lscConstraints) + return emitOpError("total access size (simd_lanes * chunk_size * " + "sizeof(elemTy)) is upto 512 bytes."); + SmallVector shape({(int64_t)getNumOffsets()}); if (chunkSize != 1) shape.push_back(chunkSize); @@ -331,7 +373,7 @@ LogicalResult CreateDescOp::verify() { //===----------------------------------------------------------------------===// LogicalResult PrefetchOp::verify() { auto tdescTy = getTensorDescType(); - if (!tdescTy.getScattered()) + if (!tdescTy.isScattered()) return emitOpError("Expects a scattered TensorDesc.\n"); if (!isReadHintOrNone(getL1HintAttr())) @@ -354,7 +396,7 @@ LogicalResult LoadGatherOp::verify() { auto maskTy = getMaskType(); auto valueTy = getValueType(); - if (!tdescTy.getScattered()) + if (!tdescTy.isScattered()) return emitOpError("Expects a scattered TensorDesc.\n"); if (!isReadHintOrNone(getL1HintAttr())) @@ -379,12 +421,10 @@ LogicalResult LoadGatherOp::verify() { if (tdescShape[0] != maskShape[0]) return emitOpError("dim-0 of the Mask and TensorDesc should be the same."); - if (getTransposeAttr()) { - auto trans = getTranspose().value(); - if (tdescShape.size() < trans.size()) - emitWarning("Invalid transpose attr. It is ignored."); - else - transpose(trans, tdescShape); + if (tdescTy.getRank() == 2) { + if (!getTransposeAttr()) + return emitOpError("load_gather has to be transposed."); + transpose({1, 0}, tdescShape); } if (valueShape != tdescShape) @@ -400,7 +440,7 @@ LogicalResult LoadGatherOp::verify() { //===----------------------------------------------------------------------===// LogicalResult StoreScatterOp::verify() { auto tdescTy = getTensorDescType(); - if (!tdescTy.getScattered()) + if (!tdescTy.isScattered()) return emitOpError("Expects a scattered TensorDesc.\n"); if (!isWriteHintOrNone(getL1HintAttr())) @@ -413,11 +453,24 @@ LogicalResult StoreScatterOp::verify() { return emitOpError("invlid l3_hint: ") << getL3HintAttr(); auto maskTy = getMaskType(); + auto valueTy = getValueType(); auto maskShape = getShapeOf(maskTy); auto tdescShape = getShapeOf(tdescTy); + auto valueShape = getShapeOf(valueTy); if (tdescShape[0] != maskShape[0]) return emitOpError("dim-0 of the Mask and TensorDesc should be the same."); + if (tdescTy.getRank() == 2) { + if (!getTransposeAttr()) + return emitOpError("load_gather has to be transposed."); + transpose({1, 0}, tdescShape); + } + + if (valueShape != tdescShape) + return emitOpError("Unexpected value shape") + << "(Expected shape: " << makeString(tdescShape) + << ", Given shape: " << makeString(valueShape) << ").\n"; + return success(); } //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/Linalg/vectorize-convolution.mlir b/mlir/test/Dialect/Linalg/vectorize-convolution.mlir index 93e36a69567bd5..7f4b9b986c81b4 100644 --- a/mlir/test/Dialect/Linalg/vectorize-convolution.mlir +++ b/mlir/test/Dialect/Linalg/vectorize-convolution.mlir @@ -39,6 +39,7 @@ func.func @conv1d_nwc_4x2x8_memref(%input: memref<4x6x3xf32>, %filter: memref<1x // CHECK: %[[CONTRACT_0:.+]] = vector.contract { // CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]], // CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"] +// CHECK-SAME: kind = #vector.kind // CHECK-SAME: %[[V_INPUT_0]], %[[V_FILTER]], %[[V_OUTPUT_0]] // CHECK-SAME: : vector<4x1x3xf32>, vector<3x8xf32> into vector<4x1x8xf32> @@ -46,6 +47,7 @@ func.func @conv1d_nwc_4x2x8_memref(%input: 
memref<4x6x3xf32>, %filter: memref<1x // CHECK: %[[CONTRACT_1:.+]] = vector.contract { // CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]], // CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"] +// CHECK-SAME: kind = #vector.kind // CHECK-SAME: %[[V_INPUT_1]], %[[V_FILTER]], %[[V_OUTPUT_1]] // CHECK-SAME: : vector<4x1x3xf32>, vector<3x8xf32> into vector<4x1x8xf32> @@ -61,6 +63,36 @@ func.func @conv1d_nwc_4x2x8_memref(%input: memref<4x6x3xf32>, %filter: memref<1x // ----- +// This test is same as above but for i1 type with the only difference being that +// the combining kind for `vector.contract` is `OR`. +func.func @conv1d_nwc_4x2x8_memref_i1(%input: memref<4x6x3xi1>, %filter: memref<1x3x8xi1>, %output: memref<4x2x8xi1>) { + linalg.conv_1d_nwc_wcf + {dilations = dense<1> : tensor<1xi64>, strides = dense<3> : tensor<1xi64>} + ins(%input, %filter : memref<4x6x3xi1>, memref<1x3x8xi1>) + outs(%output : memref<4x2x8xi1>) + return +} +// CHECK: #[[INPUT_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> +// CHECK: #[[FILTER_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d3, d2)> +// CHECK: #[[OUTPUT_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> + +// CHECK: func @conv1d_nwc_4x2x8_memref_i1 +/// w == 0, kw == 0 +// CHECK: %[[CONTRACT_0:.+]] = vector.contract { +// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]], +// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"] +// CHECK-SAME: kind = #vector.kind +// CHECK-SAME: : vector<4x1x3xi1>, vector<3x8xi1> into vector<4x1x8xi1> + +/// w == 1, kw == 0 +// CHECK: %[[CONTRACT_1:.+]] = vector.contract { +// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]], +// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"] +// CHECK-SAME: kind = #vector.kind +// CHECK-SAME: : vector<4x1x3xi1>, vector<3x8xi1> into vector<4x1x8xi1> + +// ----- + // The i8i8i32 case is similar to f32 case, so checking one case is enough for // test coverage. func.func @conv1d_nwc_4x2x8_i8i8i32_memref(%input: memref<4x6x3xi8>, %filter: memref<1x3x8xi8>, %output: memref<4x2x8xi32>) { @@ -299,6 +331,7 @@ func.func @conv1d_ncw_4x8x2_memref(%input: memref<4x3x6xf32>, %filter: memref<8x // CHECK: %[[CONTRACT_0:.+]] = vector.contract { // CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]], // CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"] +// CHECK-SAME: kind = #vector.kind // CHECK-SAME: %[[V_INPUT_0]], %[[V_FILTER]], %[[V_OUTPUT_0]] // CHECK-SAME: : vector<4x1x3xf32>, vector<3x8xf32> into vector<4x1x8xf32> @@ -306,6 +339,7 @@ func.func @conv1d_ncw_4x8x2_memref(%input: memref<4x3x6xf32>, %filter: memref<8x // CHECK: %[[CONTRACT_1:.+]] = vector.contract { // CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]], // CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"] +// CHECK-SAME: kind = #vector.kind // CHECK-SAME: %[[V_INPUT_1]], %[[V_FILTER]], %[[V_OUTPUT_1]] // CHECK-SAME: : vector<4x1x3xf32>, vector<3x8xf32> into vector<4x1x8xf32> @@ -324,6 +358,37 @@ func.func @conv1d_ncw_4x8x2_memref(%input: memref<4x3x6xf32>, %filter: memref<8x // ----- +// This test is same as above but for i1 type with the only difference being that +// the combining kind for `vector.contract` is `OR`. 
+func.func @conv1d_ncw_4x8x2_memref_i1(%input: memref<4x3x6xi1>, %filter: memref<8x3x1xi1>, %output: memref<4x8x2xi1>) { + linalg.conv_1d_ncw_fcw + {dilations = dense<1> : tensor<1xi64>, strides = dense<3> : tensor<1xi64>} + ins(%input, %filter : memref<4x3x6xi1>, memref<8x3x1xi1>) + outs(%output : memref<4x8x2xi1>) + return +} + +// CHECK: #[[INPUT_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> +// CHECK: #[[FILTER_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d3, d2)> +// CHECK: #[[OUTPUT_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> + +// CHECK: func @conv1d_ncw_4x8x2_memref_i1 +/// w == 0, kw == 0 +// CHECK: vector.contract { +// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]], +// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"] +// CHECK-SAME: kind = #vector.kind +// CHECK-SAME: : vector<4x1x3xi1>, vector<3x8xi1> into vector<4x1x8xi1> + +/// w == 1, kw == 0 +// CHECK: vector.contract { +// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]], +// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"] +// CHECK-SAME: kind = #vector.kind +// CHECK-SAME: : vector<4x1x3xi1>, vector<3x8xi1> into vector<4x1x8xi1> + +// ----- + func.func @conv1d_ncw_4x8x2_memref(%input: memref<4x3x6xf32>, %filter: memref<8x3x2xf32>, %output: memref<4x8x2xf32>) { linalg.conv_1d_ncw_fcw {dilations = dense<2> : tensor<1xi64>, strides = dense<3> : tensor<1xi64>} diff --git a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir index ad3a8d9f926082..2c56b7139fec49 100644 --- a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir +++ b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir @@ -307,6 +307,96 @@ module attributes {transform.with_named_sequence} { // ----- +// Reading a 1D column vector (hence a candidate for a contiguous load), but given +// %1, it's a gather load. 
+ +#map = affine_map<(d0, d1) -> (d0, d1)> +func.func @index_from_output_column_vector_gather_load(%src: tensor<8x128xf32>) -> tensor<8x1xf32> { + %c0 = arith.constant 0 : index + %0 = tensor.empty() : tensor<8x1xf32> + %res = linalg.generic { + indexing_maps = [#map], + iterator_types = ["parallel", "parallel"] + } outs(%0 : tensor<8x1xf32>) { + ^bb0(%arg1: f32): + %1 = linalg.index 0 : index + %extracted = tensor.extract %src[%1, %c0] : tensor<8x128xf32> + linalg.yield %extracted : f32 + } -> tensor<8x1xf32> + return %res : tensor<8x1xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg2: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.generic"]} in %arg2 : (!transform.any_op) -> !transform.any_op + %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.vectorize_children_and_apply_patterns %1 {vectorize_nd_extract} : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// CHECK-LABEL: func.func @index_from_output_column_vector_gather_load( +// CHECK-SAME: %[[SRC:.*]]: tensor<8x128xf32>) -> tensor<8x1xf32> { +// CHECK: %[[C128:.*]] = arith.constant dense<128> : vector<1x8xindex> +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[PASS_THRU:.*]] = arith.constant dense<0.000000e+00> : vector<8x1xf32> +// CHECK: %[[MASK:.*]] = arith.constant dense : vector<8x1xi1> +// CHECK: %[[IDX_VEC:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7]> : vector<8xindex> +// CHECK: %[[OUT:.*]] = tensor.empty() : tensor<8x1xf32> +// CHECK: %[[B:.*]] = vector.broadcast %[[IDX_VEC]] : vector<8xindex> to vector<1x8xindex> +// CHECK: %[[MUL:.*]] = arith.muli %[[B]], %[[C128]] : vector<1x8xindex> +// CHECK: %[[TR:.*]] = vector.transpose %[[MUL]], [1, 0] : vector<1x8xindex> to vector<8x1xindex> +// CHECK: %[[GATHER:.*]] = vector.gather %[[SRC]]{{\[}}%[[C0]], %[[C0]]] {{\[}}%[[TR]]], %[[MASK]], %[[PASS_THRU]] : tensor<8x128xf32>, vector<8x1xindex>, vector<8x1xi1>, vector<8x1xf32> into vector<8x1xf32> +// CHECK: %[[RES:.*]] = vector.transfer_write %[[GATHER]], %[[OUT]]{{\[}}%[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<8x1xf32>, tensor<8x1xf32> +// CHECK: return %[[RES]] : tensor<8x1xf32> + +// ----- + +// Same as above, but the access indices have been swapped and hence this _is_ +// a contiguous load. Currently not supported and lowered as vector.gather +// instead. +// TODO: Make sure that this is lowered as a contiguous load. 
+ +#map = affine_map<(d0, d1) -> (d0, d1)> +func.func @index_from_output_column_vector_contiguous_load(%src: tensor<8x128xf32>) -> tensor<8x1xf32> { + %c0 = arith.constant 0 : index + %0 = tensor.empty() : tensor<8x1xf32> + %res = linalg.generic { + indexing_maps = [#map], + iterator_types = ["parallel", "parallel"] + } outs(%0 : tensor<8x1xf32>) { + ^bb0(%arg1: f32): + %1 = linalg.index 0 : index + %extracted = tensor.extract %src[%c0, %1] : tensor<8x128xf32> + linalg.yield %extracted : f32 + } -> tensor<8x1xf32> + return %res : tensor<8x1xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg2: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.generic"]} in %arg2 : (!transform.any_op) -> !transform.any_op + %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.vectorize_children_and_apply_patterns %1 {vectorize_nd_extract} : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// CHECK-LABEL: func.func @index_from_output_column_vector_contiguous_load( +// CHECK-SAME: %[[SRC:.*]]: tensor<8x128xf32>) -> tensor<8x1xf32> { +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[PASS_THRU:.*]] = arith.constant dense<0.000000e+00> : vector<8x1xf32> +// CHECK: %[[MASK:.*]] = arith.constant dense : vector<8x1xi1> +// CHECK: %[[IDX_VEC:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7]> : vector<8xindex> +// CHECK: %[[OUT:.*]] = tensor.empty() : tensor<8x1xf32> +// CHECK: %[[B:.*]] = vector.broadcast %[[IDX_VEC]] : vector<8xindex> to vector<1x8xindex> +// CHECK: %[[TR:.*]] = vector.transpose %[[B]], [1, 0] : vector<1x8xindex> to vector<8x1xindex> +// CHECK: %[[GATHER:.*]] = vector.gather %[[SRC]]{{\[}}%[[C0]], %[[C0]]] {{\[}}%[[TR]]], %[[MASK]], %[[PASS_THRU]] : tensor<8x128xf32>, vector<8x1xindex>, vector<8x1xi1>, vector<8x1xf32> into vector<8x1xf32> +// CHECK: %[[RES:.*]] = vector.transfer_write %[[GATHER]], %[[OUT]]{{\[}}%[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<8x1xf32>, tensor<8x1xf32> +// CHECK: return %[[RES]] : tensor<8x1xf32> + +// ----- + #map = affine_map<(d0) -> (d0)> func.func @vectorize_nd_tensor_extract_contiguous_and_gather(%arg0: tensor<6xf32>, %arg1: tensor<5xi32>) -> tensor<5xf32> { %c5 = arith.constant 5 : index diff --git a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir index 35d44cf56a239b..c1126efb6046dc 100644 --- a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir +++ b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir @@ -24,8 +24,8 @@ gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : ind // CHECK: gpu.func @test_create_nd_tdesc_vc_3(%[[arg0:.*]]: memref<24x32xf32>) { gpu.func @test_create_nd_tdesc_vc_3(%src: memref<24x32xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.tdesc_attr - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.tdesc_attr> + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr> gpu.return } @@ -36,6 +36,13 @@ gpu.func @test_create_nd_tdesc_vc_4(%src: memref<2x24x32xf32>) { gpu.return } +// CHECK: gpu.func @test_create_nd_tdesc_vc_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) { +gpu.func 
@test_create_nd_tdesc_vc_5(%src: memref<2x24x32xf32, 3>) { + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr> + %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr> + gpu.return +} + // CHECK: gpu.func @test_prefetch_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) { gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> @@ -97,17 +104,24 @@ gpu.func @test_create_update_nd_tdesc_vc(%src: memref<24x32xf32>) { // CHECK: gpu.func @test_create_tdesc_vc(%[[arg0:.*]]: ui64) { gpu.func @test_create_tdesc_vc(%src: ui64) { - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] {chunk_size = 2 : i64} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> - %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + %1 = xegpu.create_tdesc %src[0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + gpu.return +} + +// CHECK: gpu.func @test_create_tdesc_vc_1(%[[arg0:.*]]: memref) { +gpu.func @test_create_tdesc_vc_1(%src: memref) { + //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] : memref -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + %1 = xegpu.create_tdesc %src[0, 8, 16, 24] : memref -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> gpu.return } // CHECK: gpu.func @test_prefetch_vc(%[[arg0:.*]]: ui64) { gpu.func @test_prefetch_vc(%src: ui64) { - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] {chunk_size = 2 : i64} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> - %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> - // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> - xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + %1 = xegpu.create_tdesc %src[0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> gpu.return } @@ -115,12 +129,12 @@ gpu.func @test_prefetch_vc(%src: ui64) { gpu.func @test_load_gather_vc(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense : vector<4xi1> %0 = arith.constant dense<1>: vector<4xi1> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] {chunk_size = 2 : i64} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> - %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> - //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> - //CHECK-SAME: !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>, vector<4xi1> -> vector<4x2xf32> - %2 = 
xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> - : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>, vector<4xi1> -> vector<4x2xf32> + //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + %1 = xegpu.create_tdesc %src[0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> + //CHECK-SAME: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> -> vector<2x4xf32> + %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> + : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> -> vector<2x4xf32> gpu.return } @@ -128,23 +142,23 @@ gpu.func @test_load_gather_vc(%src: ui64) { gpu.func @test_store_scatter_vc(%src: ui64) { //CHECK: %[[c0:.*]] = arith.constant dense : vector<4xi1> %0 = arith.constant dense<1>: vector<4xi1> - //CHECK: %[[c1:.*]] = arith.constant dense<2.900000e+00> : vector<4x2xf32> - %1 = arith.constant dense<2.9>: vector<4x2xf32> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] {chunk_size = 2 : i64} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> - %2 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store %[[c1]], %[[R0]], %[[c0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> - //CHECK-SAME: vector<4x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>, vector<4xi1> - xegpu.store %1, %2, %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> - : vector<4x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>, vector<4xi1> + //CHECK: %[[c1:.*]] = arith.constant dense<2.900000e+00> : vector<2x4xf32> + %1 = arith.constant dense<2.9>: vector<2x4xf32> + //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + %2 = xegpu.create_tdesc %src[0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + //CHECK: xegpu.store %[[c1]], %[[R0]], %[[c0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> + //CHECK-SAME: vector<2x4xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> + xegpu.store %1, %2, %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> + : vector<2x4xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> gpu.return } // CHECK: gpu.func @test_create_update_tdesc_vc(%[[arg0:.*]]: ui64) { gpu.func @test_create_update_tdesc_vc(%src: ui64) { - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] {chunk_size = 2 : i64} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> - %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> - //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], [32, 32, 32, 32] : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> - %2 = xegpu.update_offset %1, [32, 32, 32, 32] : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + %1 = xegpu.create_tdesc %src[0, 8, 16, 24]: ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], [32, 32, 32, 32] : !xegpu.tensor_desc<4x2xf32, 
#xegpu.scatter_tdesc_attr> + %2 = xegpu.update_offset %1, [32, 32, 32, 32] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> gpu.return } @@ -165,10 +179,10 @@ gpu.func @test_dpas_vc_with_packed_b(%a : vector<8x16xf16>, %b: vector<8x16x2xf1 // CHECK: gpu.func @test_atomic_rmw(%[[arg0:.*]]: ui64, %[[arg1:.*]]: vector<16xf32>, %[[arg2:.*]]: vector<16xi1>) gpu.func @test_atomic_rmw(%src: ui64, %value : vector<16xf32>, %mask : vector<16xi1>) { - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %1 = xegpu.create_tdesc %src[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]: ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - //CHECK: %[[R1:.*]] = xegpu.atomic_rmw addf %[[R0]], %[[arg2]], %[[arg1]] : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1>, vector<16xf32> -> vector<16xf32> - xegpu.atomic_rmw addf %1, %mask, %value: !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1>, vector<16xf32> -> vector<16xf32> + //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %1 = xegpu.create_tdesc %src[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]: ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + //CHECK: %[[R1:.*]] = xegpu.atomic_rmw addf %[[R0]], %[[arg2]], %[[arg1]] : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32> -> vector<16xf32> + xegpu.atomic_rmw addf %1, %mask, %value: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32> -> vector<16xf32> gpu.return } diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir index 7ef50bb2b5fadf..193dae352e3707 100644 --- a/mlir/test/Dialect/XeGPU/invalid.mlir +++ b/mlir/test/Dialect/XeGPU/invalid.mlir @@ -15,6 +15,20 @@ func.func @test_create_nd_tdesc_vc_2(%src: memref<24x32xf32>) { return } +// ----- +func.func @test_create_nd_tdesc_vc_3(%src: memref<2x24x32xf32, 3>) { + // expected-error@+1 {{SLM is not supported for 2D Block TensorDesc}} + %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + return +} + +// ----- +func.func @test_create_nd_tdesc_vc_4(%src: memref<2x24x32xf32, 3>) { + // expected-error@+1 {{Memory space mismatch}} + %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32> + return +} + // ----- func.func @test_prefetch_nd_vc_1(%src: memref<24x32xf16>) { %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> @@ -26,10 +40,10 @@ func.func @test_prefetch_nd_vc_1(%src: memref<24x32xf16>) { // ----- func.func @test_prefetch_nd_vc_2(%src: memref<24xf16>) { %1 = xegpu.create_tdesc %src[0, 1, 2, 3, 4, 5, 6, 7] - : memref<24xf16> -> !xegpu.tensor_desc<8xf16, #xegpu.tdesc_attr> + : memref<24xf16> -> !xegpu.tensor_desc<8xf16, #xegpu.scatter_tdesc_attr<>> // expected-error@+1 {{Expects a non-scattered TensorDesc}} xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint}> - : !xegpu.tensor_desc<8xf16, #xegpu.tdesc_attr> + : !xegpu.tensor_desc<8xf16, #xegpu.scatter_tdesc_attr<>> return } @@ -44,11 +58,11 @@ func.func @test_load_nd_vc_1(%src: memref<8x16xf16>) { // ----- func.func @test_load_nd_vc_2(%src: memref<16xf16>) { - %1 = xegpu.create_tdesc %src[0, 2, 4, 6, 8, 10, 12, 14] {chunk_size = 2} - : memref<16xf16> -> 
!xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr> + %1 = xegpu.create_tdesc %src[0, 2, 4, 6, 8, 10, 12, 14] + : memref<16xf16> -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr> // expected-error@+1 {{Expects a non-scattered TensorDesc.}} %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint}> - : !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr> -> vector<8x2xf16> + : !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr> -> vector<8x2xf16> return } @@ -73,28 +87,28 @@ func.func @test_store_nd_vc_1(%dst: memref<24x32xf16>) { // ----- func.func @test_store_nd_vc_2(%dst: memref<16xf16>) { %1 = arith.constant dense<1.0>: vector<8x2xf16> - %2 = xegpu.create_tdesc %dst[0, 2, 4, 6, 8, 10, 12, 14] {chunk_size = 2} - : memref<16xf16> -> !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr> + %2 = xegpu.create_tdesc %dst[0, 2, 4, 6, 8, 10, 12, 14] + : memref<16xf16> -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr> // expected-error@+1 {{Expects a non-scattered TensorDesc}} xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint}> - : vector<8x2xf16>, !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr> + : vector<8x2xf16>, !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr> return } // ----- func.func @test_update_nd_offset_1(%dst: memref<16xf16>) { - %1 = xegpu.create_tdesc %dst[0, 2, 4, 6, 8, 10, 12, 14] {chunk_size = 2} - : memref<16xf16> -> !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr> + %1 = xegpu.create_tdesc %dst[0, 2, 4, 6, 8, 10, 12, 14] + : memref<16xf16> -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr> // expected-error@+1 {{Expects a non-scattered TensorDesc}} - xegpu.update_nd_offset %1, [0, 2] : !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr> + xegpu.update_nd_offset %1, [0, 2] : !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr> return } // ----- func.func @test_create_tdesc_vc_1(%src: ui64) { // expected-error@+1 {{Expects a scattered TensorDesc}} - %1 = xegpu.create_tdesc %src[0, 2, 4, 6, 8, 10, 12, 14] {chunk_size = 2} - : ui64 -> !xegpu.tensor_desc<8x2xf16> + %1 = xegpu.create_tdesc %src[0, 2, 4, 6, 8, 10, 12, 14] + : ui64 -> !xegpu.tensor_desc<8xf16> return } @@ -102,7 +116,14 @@ func.func @test_create_tdesc_vc_1(%src: ui64) { func.func @test_create_tdesc_vc_2(%src: ui64) { // expected-error@+1 {{Incorrect TensorDesc shape}} %1 = xegpu.create_tdesc %src[0, 2, 4, 6, 8, 10, 12, 14] {chunk_size = 2} - : ui64 -> !xegpu.tensor_desc<8x4xf16, #xegpu.tdesc_attr> + : ui64 -> !xegpu.tensor_desc<8x4xf16, #xegpu.scatter_tdesc_attr<>> + return +} + +// ----- +func.func @test_create_tdesc_vc_1(%src: memref) { + // expected-error@+1 {{Memory space mismatch}} + %1 = xegpu.create_tdesc %src[0, 8, 16, 24] : memref -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> return } @@ -116,9 +137,9 @@ func.func @test_prefetch_vc_1(%src: memref<24x32xf16>) { // ----- func.func @test_prefetch_vc_2(%src: ui64) { - %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + %1 = xegpu.create_tdesc %src[0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> // expected-error@+1 {{invlid l1_hint: #xegpu.cache_hint}} - xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> return } @@ -135,11 +156,11 @@ func.func @test_load_gather_vc_1(%src: memref<24x32xf16>) { // ----- func.func @test_load_gather_vc_2(%src: ui64) { %0 = arith.constant dense<1>: vector<4xi1> - %1 = 
xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 - -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + %1 = xegpu.create_tdesc %src[0, 8, 16, 24] : ui64 + -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> // expected-error@+1 {{invlid l1_hint: #xegpu.cache_hint}} %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint}> - : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>, vector<4xi1> + : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> -> vector<4x2xf32> return } @@ -159,11 +180,11 @@ func.func @test_store_scatter_vc_1(%src: memref<24x32xf32>) { func.func @test_store_scatter_vc_2(%src: ui64) { %0 = arith.constant dense<1>: vector<4xi1> %1 = arith.constant dense<2.9>: vector<4x2xf32> - %2 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} - : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + %2 = xegpu.create_tdesc %src[0, 8, 16, 24] + : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> // expected-error@+1 {{invlid l1_hint: #xegpu.cache_hint}} xegpu.store %1, %2, %0 <{l1_hint = #xegpu.cache_hint}> : vector<4x2xf32>, - !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>, vector<4xi1> + !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> return } @@ -182,9 +203,9 @@ func.func @test_dpas_vc_2(%a : vector<8x8x2xf16>, %b: vector<8x16x2xf16>) { } // ----- -func.func @test_atomic_rmw(%src: ui64, %value : vector<16x8xf32>, %mask : vector<16xi1>) { - %1 = xegpu.create_tdesc %src[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] {chunk_size = 8}: ui64 -> !xegpu.tensor_desc<16x8xf32, #xegpu.tdesc_attr> - // expected-error@+1 {{failed to verify that all of {tensorDesc, mask, value, result} have same shape}} - xegpu.atomic_rmw addf %1, %mask, %value: !xegpu.tensor_desc<16x8xf32, #xegpu.tdesc_attr>, vector<16xi1>, vector<16x8xf32> -> vector<16x8xf32> - gpu.return +func.func @test_atomic_rmw(%src: ui64, %value : vector<16x4xf32>, %mask : vector<16xi1>) { + %1 = xegpu.create_tdesc %src[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : ui64 -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr> + // expected-error@+1 {{failed to verify that all of {tensorDesc, value, result} have same shape}} + xegpu.atomic_rmw addf %1, %mask, %value: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1>, vector<16x4xf32> -> vector<16x8xf32> + return } \ No newline at end of file diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index a269cf861a5b74..140d48c8f96848 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -725,6 +725,7 @@ libc_support_library( deps = [ ":__support_common", ":__support_cpp_type_traits", + ":__support_fputil_cast", ":__support_fputil_dyadic_float", ":__support_fputil_fenv_impl", ":__support_fputil_fp_bits", @@ -929,6 +930,7 @@ libc_support_library( ":__support_cpp_bit", ":__support_cpp_limits", ":__support_cpp_type_traits", + ":__support_fputil_cast", ":__support_fputil_dyadic_float", ":__support_fputil_fp_bits", ":__support_fputil_nearest_integer_operations", @@ -986,6 +988,7 @@ libc_support_library( ":__support_common", ":__support_cpp_bit", ":__support_cpp_type_traits", + ":__support_fputil_cast", ":__support_fputil_dyadic_float", ":__support_fputil_fenv_impl", ":__support_fputil_fp_bits", @@ -1091,6 +1094,7 @@ libc_support_library( ":__support_fputil_fenv_impl", ":__support_fputil_fp_bits", ":__support_fputil_multiply_add", + 
":__support_fputil_rounding_mode", ":__support_macros_optimization", ], ) @@ -1821,26 +1825,11 @@ libc_math_function( ], ) -libc_math_function( - name = "ceil", - specializations = [ - "generic", - ], -) +libc_math_function(name = "ceil") -libc_math_function( - name = "ceilf", - specializations = [ - "generic", - ], -) +libc_math_function(name = "ceilf") -libc_math_function( - name = "ceill", - specializations = [ - "generic", - ], -) +libc_math_function(name = "ceill") libc_math_function(name = "ceilf128") @@ -2122,19 +2111,9 @@ libc_math_function( ], ) -libc_math_function( - name = "floor", - specializations = [ - "generic", - ], -) +libc_math_function(name = "floor") -libc_math_function( - name = "floorf", - specializations = [ - "generic", - ], -) +libc_math_function(name = "floorf") libc_math_function(name = "floorl") @@ -2635,19 +2614,9 @@ libc_math_function(name = "rintl") libc_math_function(name = "rintf128") -libc_math_function( - name = "round", - specializations = [ - "generic", - ], -) +libc_math_function(name = "round") -libc_math_function( - name = "roundf", - specializations = [ - "generic", - ], -) +libc_math_function(name = "roundf") libc_math_function(name = "roundl") @@ -2846,19 +2815,9 @@ libc_math_function(name = "totalordermagl") libc_math_function(name = "totalordermagf128") -libc_math_function( - name = "trunc", - specializations = [ - "generic", - ], -) +libc_math_function(name = "trunc") -libc_math_function( - name = "truncf", - specializations = [ - "generic", - ], -) +libc_math_function(name = "truncf") libc_math_function(name = "truncl") diff --git a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl index ec3714407cb914..f298f817af83d7 100644 --- a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl +++ b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl @@ -129,7 +129,6 @@ def libc_function( def libc_math_function( name, - specializations = None, additional_deps = None): """Add a target for a math function. @@ -142,14 +141,6 @@ def libc_math_function( math function. """ additional_deps = additional_deps or [] - specializations = specializations or ["generic"] - select_map = {} - if "generic" in specializations: - select_map["//conditions:default"] = ["src/math/generic/" + name + ".cpp"] - if "aarch64" in specializations: - select_map[PLATFORM_CPU_ARM64] = ["src/math/aarch64/" + name + ".cpp"] - if "x86_64" in specializations: - select_map[PLATFORM_CPU_X86_64] = ["src/math/x86_64/" + name + ".cpp"] #TODO(michaelrj): Fix the floating point dependencies OLD_FPUTIL_DEPS = [ @@ -166,7 +157,7 @@ def libc_math_function( ] libc_function( name = name, - srcs = selects.with_or(select_map), + srcs = ["src/math/generic/" + name + ".cpp"], hdrs = ["src/math/" + name + ".h"], deps = [":__support_common"] + OLD_FPUTIL_DEPS + additional_deps, )
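
Stepping back from the Bazel cleanup to the XeGPU portion of this change, the sketch below recaps the two new `create_nd_tdesc` verifier rules exercised by the invalid.mlir additions above: the TensorDesc must agree with its source on memory space, and 2D block descriptors cannot live in SLM. The function name is made up for illustration, and the spelling `memory_space = slm` is assumed from the new `MemorySpaceAttr` assembly format:

```mlir
// The source memref lives in address space 3 (SLM), so the descriptor must say so too.
func.func @slm_descriptors(%src: memref<2x24x32xf32, 3>) {
  // OK: a 1D block TensorDesc in shared local memory.
  %ok = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3>
      -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>>
  // Rejected by the new rules (kept as comments so the function verifies):
  //   - omitting memory_space defaults the descriptor to global -> "Memory space mismatch"
  //   - a 2D shape such as 8x16xf32 with memory_space = slm ->
  //     "SLM is not supported for 2D Block TensorDesc"
  return
}
```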