diff --git a/IGC/Compiler/CISACodeGen/GenerateBlockMemOpsPass.cpp b/IGC/Compiler/CISACodeGen/GenerateBlockMemOpsPass.cpp
index 56dcd0c8c2d6..7e0fb482989f 100644
--- a/IGC/Compiler/CISACodeGen/GenerateBlockMemOpsPass.cpp
+++ b/IGC/Compiler/CISACodeGen/GenerateBlockMemOpsPass.cpp
@@ -12,7 +12,10 @@ SPDX-License-Identifier: MIT
 #include "Compiler/IGCPassSupport.h"
 #include "Compiler/CISACodeGen/helper.h"
 #include "common/LLVMWarningsPush.hpp"
+#include <cmath>
 #include <llvm/IR/Function.h>
+#include "llvm/IR/Verifier.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include "common/LLVMWarningsPop.hpp"
 #include "GenerateBlockMemOpsPass.hpp"
 #include "IGCIRBuilder.h"
@@ -32,7 +35,7 @@ IGC_INITIALIZE_PASS_DEPENDENCY(WIAnalysis)
 IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
 IGC_INITIALIZE_PASS_END(GenerateBlockMemOpsPass, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
 
-const uint MaxSgSize = 32;
+const size_t MaxSgSize = 32;
 
 GenerateBlockMemOpsPass::GenerateBlockMemOpsPass() : FunctionPass(ID) {
     initializeGenerateBlockMemOpsPassPass(*PassRegistry::getPassRegistry());
@@ -43,34 +46,472 @@ bool GenerateBlockMemOpsPass::runOnFunction(Function &F) {
         return false;
 
     bool Changed = false;
-    SmallVector<Instruction *, 8> LoadStoreToProcess;
+
+    // Load / store instructions that are not under divergent control flow and can be optimized.
+    SmallVector<Instruction *, 8> LoadStoreToProcess;
+    // Load / store instructions that are inside a loop and can be optimized.
+    DenseMap<Loop *, SmallVector<Instruction *, 4>> LoadStoreInLoop;
 
     MdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
     CGCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
+    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+    WI = &getAnalysis<WIAnalysis>();
 
     IGCMD::FunctionInfoMetaDataHandle Info = MdUtils->getFunctionsInfoItem(&F);
     if (Info->getType() != FunctionTypeMD::KernelFunction)
         return false;
 
+    // If the subgroup size is not specified, the maximum subgroup size is used.
+    IGC::IGCMD::FunctionInfoMetaDataHandle FuncInfoMD = MdUtils->getFunctionsInfoItem(&F);
+    IGC::IGCMD::SubGroupSizeMetaDataHandle SubGroupSize = FuncInfoMD->getSubGroupSize();
+    if (SubGroupSize->hasValue()) {
+        SimdSize = SubGroupSize->getSIMDSize();
+    } else {
+        SimdSize = MaxSgSize;
+    }
+
     // Check that workgroups have been scalarized along the x-axis.
     if (!checkVectorizationAlongX(&F))
        return false;
 
-    WI = &getAnalysis<WIAnalysis>();
-
     // Collect all load / store instructions which can be replaced.
-    for (auto &B : F)
-        for (auto &I : B)
-            if (canOptLoadStore(&I))
+    for (auto &B : F) {
+        for (auto &I : B) {
+            if (!isa<LoadInst>(&I) && !isa<StoreInst>(&I))
+                continue;
+
+            if (!canOptLoadStore(&I))
+                continue;
+
+            // Block read and write instructions must be executed by all elements in the subgroup.
+            if (!WI->insideDivergentCF(&I)) {
                 LoadStoreToProcess.push_back(&I);
+            } else if (Loop *L = LI->getLoopFor(I.getParent())) {
+                // In some cases IGC cannot prove that there is no code divergence in the loop.
+                // Handle these cases here.
+
+                // Check whether the loop has already been analyzed.
+                if (LoadStoreInLoop.find(L) == LoadStoreInLoop.end()) {
+                    if (!isLoopPattern(L))
+                        continue;
+
+                    SmallVector<Instruction *, 4> Vec;
+                    Vec.push_back(&I);
+                    LoadStoreInLoop.insert(std::make_pair(L, Vec));
+                } else {
+                    LoadStoreInLoop[L].push_back(&I);
+                }
+            }
+        }
+    }
 
-    // Replace load / store instructions with block ones.
+    // Optimize cases without loops.
     for (auto I : LoadStoreToProcess)
-        Changed = changeToBlockInst(I);
+        Changed |= changeToBlockInst(I);
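+
+    // Illustrative effect (a sketch; the intrinsic names are the ones checked in the lit test below):
+    // a per-lane contiguous access such as
+    //   %v = load float, float addrspace(1)* %p, align 4
+    // is rewritten by changeToBlockInst into one subgroup-wide block read:
+    //   %v = call float @llvm.genx.GenISA.simdBlockRead.f32.p1f32(float addrspace(1)* %p)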
+
+    // Optimize cases with loops. Split the loop into a remainder computation and a new uniform loop.
+    // The remainder contains the code divergence.
+    // The new loop contains the main part of the original loop, free of code divergence.
+    //
+    // For example:
+    //
+    // for (int idx = global_id_x + offset; idx < N; idx += simdsize) {
+    //     A[idx] = B[idx];
+    // }
+    //
+    // will be split into:
+    //
+    // int idx = global_id_x + offset;
+    // if (idx < N - (N - offset) / simdsize * simdsize) {
+    //     A[idx] = B[idx];
+    // }
+    //
+    // for (int idx = global_id_x + N - (N - offset) / simdsize * simdsize; idx < N; idx += simdsize) {
+    //     auto x = sg.load(&B[idx]);
+    //     sg.store(&A[idx], x);
+    // }
+    //
+    for (const auto &Pair : LoadStoreInLoop) {
+        Loop *L = Pair.first;
+        BasicBlock *OldLatch = L->getLoopLatch();
+        BasicBlock *OldPreheader = L->getLoopPreheader();
+        PHINode *OldInductionPHI = L->getInductionVariable(*SE);
+        ICmpInst *OldLatchCmp = cast<ICmpInst>(cast<BranchInst>(OldLatch->getTerminator())->getCondition());
+        Value *OldLimit = OldLatchCmp->getOperand(1);
+        Value *OldIncomingIndV = OldInductionPHI->getIncomingValueForBlock(OldPreheader);
+
+        SmallVector<BasicBlock *, 4> ExitBlocks;
+        L->getExitBlocks(ExitBlocks);
+        BasicBlock *Exit = ExitBlocks[0];
+
+        // Get the branch instruction that defines the condition for entering the loop.
+        BranchInst *PreConditionBranch = cast<BranchInst>(OldPreheader->getTerminator());
+        if (!PreConditionBranch->isConditional())
+            PreConditionBranch = cast<BranchInst>((*pred_begin(OldPreheader))->getTerminator());
+        ICmpInst *PreCondition = dyn_cast<ICmpInst>(PreConditionBranch->getCondition());
+
+        // Get the offset for the initial value of the induction variable.
+        SmallVector<Value *, 2> Offset;
+        if (!getOffset(OldIncomingIndV, Offset))
+            continue;
+
+        // Create a new basic block that separates the remainder from the new loop.
+        LLVMContext &Context = OldLatch->getContext();
+        BasicBlock *SeparatorBasicBlock = BasicBlock::Create(Context, ".separator", &F);
+        SeparatorBasicBlock->moveAfter(OldLatch);
+
+        // Clone the loop.
+        ValueToValueMapTy VMap;
+        BasicBlock *ClonedLatch = CloneBasicBlock(OldLatch, VMap, ".new.loop", &F);
+        for (auto &I : *ClonedLatch)
+            RemapInstruction(&I, VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+
+        // Clone the pre-condition and the pre-condition branch into the separator block.
+        ICmpInst *ClonedPreCondition = cast<ICmpInst>(PreCondition->clone());
+        BranchInst *ClonedPreConditionBranch = cast<BranchInst>(PreConditionBranch->clone());
+        SeparatorBasicBlock->getInstList().push_back(ClonedPreCondition);
+        SeparatorBasicBlock->getInstList().push_back(ClonedPreConditionBranch);
+
+        // Create an empty exit for the new loop.
+        BasicBlock *ExitForTheNewLoop = BasicBlock::Create(Context, ".new.exit", &F);
+        ExitForTheNewLoop->moveAfter(ClonedLatch);
+        IRBuilder<> Builder(ExitForTheNewLoop);
+        Builder.CreateBr(Exit);
+        Changed = true;
+
+        // Create an empty preheader for the new loop.
+        BasicBlock *PreheaderForTheNewLoop = BasicBlock::Create(Context, ".new.preheader", &F);
+        PreheaderForTheNewLoop->moveAfter(SeparatorBasicBlock);
+        Builder.SetInsertPoint(PreheaderForTheNewLoop);
+        Builder.CreateBr(ClonedLatch);
+
+        // Update the cloned pre-condition branch successors.
+        ClonedPreConditionBranch->setCondition(ClonedPreCondition);
+        ClonedPreConditionBranch->setSuccessor(0, PreheaderForTheNewLoop);
+        ClonedPreConditionBranch->setSuccessor(1, Exit);
+
+        // Update the cloned latch branch successors.
+        BranchInst *ClonedLatchBranch = cast<BranchInst>(ClonedLatch->getTerminator());
+        ClonedLatchBranch->setSuccessor(0, ClonedLatch);
+        ClonedLatchBranch->setSuccessor(1, ExitForTheNewLoop);
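+
+        // CFG being assembled at this point (sketch; names are the blocks created above):
+        //   old latch (remainder) -> .separator
+        //   .separator --cloned pre-condition true--> .new.preheader -> .new.loop
+        //   .separator --false--> exit;  .new.loop --back edge--> .new.loop, --done--> .new.exit -> exit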
+
+        // Insert the cloned latch block after the separator block.
+        ClonedLatch->moveAfter(SeparatorBasicBlock);
+
+        // Calculate the new limit for the remainder:
+        // newlimit = limit - (limit - offset1 - offset2) / simdsize * simdsize
+        //
+        // In IR it looks like:
+        //
+        // %suboffset1 = sub i32 %limit, %offset1
+        // %suboffset2 = sub i32 %suboffset1, %offset2
+        // %qot = ashr i32 %suboffset2, log2(SimdSize)
+        // %neg_qot = sub i32 0, %qot
+        // %qotshl = shl i32 %neg_qot, log2(SimdSize)
+        // %newlimit = add nsw i32 %limit, %qotshl
+        //
+        Type *LimitType = OldLimit->getType();
+
+        auto processOffset = [&](Value *SubArg) {
+            for (auto Val : Offset) {
+                if (!Val)
+                    break;
+
+                Value *OffsetVal = Val;
+                Type *ValType = Val->getType();
+                if (LimitType != ValType)
+                    OffsetVal = Builder.CreateZExt(Val, LimitType, "casted_offset");
+
+                SubArg = Builder.CreateSub(SubArg, OffsetVal);
+            }
+            return SubArg;
+        };
+
+        // Calculate the new limit (NewLimitFirstLoop) for the remainder.
+        Builder.SetInsertPoint(PreCondition);
+        Value *SubOffset = processOffset(OldLimit);
+
+        int LogSimdSizeBase2 = std::log2(SimdSize);
+        Value *AshrInst = Builder.CreateAShr(SubOffset, ConstantInt::get(LimitType, LogSimdSizeBase2), "ashr");
+        Value *Neg = Builder.CreateSub(ConstantInt::get(LimitType, 0), AshrInst, "neg");
+        Value *Shl = Builder.CreateShl(Neg, ConstantInt::get(LimitType, LogSimdSizeBase2));
+        Value *NewLimitFirstLoop = Builder.CreateAdd(OldLimit, Shl);
+
+        // Update the cmp instructions in the remainder and in the preheader with the new limit.
+        PreCondition->setOperand(1, NewLimitFirstLoop);
+        OldLatchCmp->setOperand(1, NewLimitFirstLoop);
+
+        // Calculate the initial value of the induction variable for the new loop.
+        Builder.SetInsertPoint(SeparatorBasicBlock, SeparatorBasicBlock->getFirstInsertionPt());
+        Value *OffsetForNewLoop = processOffset(NewLimitFirstLoop);
+
+        Value *NewIncInductiveVar = Builder.CreateAdd(OldIncomingIndV, OffsetForNewLoop);
+
+        // Set operands for the cloned pre-condition.
+        ClonedPreCondition->setOperand(0, NewIncInductiveVar);
+        ClonedPreCondition->setOperand(1, OldLimit);
+
+        // Substitute the load / store instructions in the new loop with block ones.
+        for (auto I : Pair.second) {
+            Instruction *NewI = cast<Instruction>(VMap[cast<Value>(I)]);
+            changeToBlockInst(NewI);
+        }
+
+        std::vector<PHINode *> PhiNodes;
+        // Set operands for the phi instructions in the new loop and prepare their initial values.
+        for (auto &I : *OldLatch) {
+            if (!isa<PHINode>(&I))
+                break;
+
+            Value *IVal = cast<Value>(&I);
+            PHINode *Phi = cast<PHINode>(&I);
+            PHINode *NewPhi = dyn_cast<PHINode>(VMap[IVal]);
+            Value *OldIncomingV = Phi->getIncomingValueForBlock(OldPreheader);
+            PhiNodes.push_back(Phi);
+
+            for (unsigned i = 0; i < Phi->getNumIncomingValues(); ++i) {
+                if (NewPhi->getIncomingBlock(i) == OldLatch) {
+                    NewPhi->setIncomingBlock(i, ClonedLatch);
+                } else if (NewPhi->getIncomingBlock(i) == OldPreheader) {
+                    NewPhi->setIncomingBlock(i, PreheaderForTheNewLoop);
+                }
+            }
+
+            Value *NewInc = nullptr;
+            if (GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(OldIncomingV)) {
+                Type *GEPType = Gep->getResultElementType();
+                NewInc = Builder.CreateGEP(GEPType, OldIncomingV, OffsetForNewLoop);
+            } else if (Phi == OldInductionPHI) {
+                NewInc = NewIncInductiveVar;
+            }
+            NewPhi->setIncomingValueForBlock(PreheaderForTheNewLoop, NewInc);
+        }
+
+        // Erase the phi instructions from the remainder (this turns it into a plain if-statement).
+        for (auto Phi : PhiNodes) {
+            Value *OldIncomingV = Phi->getIncomingValueForBlock(OldPreheader);
+            Phi->replaceAllUsesWith(OldIncomingV);
+            Phi->eraseFromParent();
+        }
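+
+        // Worked example with hypothetical values simdsize = 32, offset = 0 and limit N = 100:
+        // newlimit = 100 - (100 / 32) * 32 = 4, so lanes with global_id_x < 4 execute the
+        // remainder once, and the uniform loop then covers [global_id_x + 4, 100) in three
+        // full subgroup-wide steps.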
+
+        // Erase the conditional branch from the old latch and create an unconditional branch
+        // to the separator block.
+        BranchInst *OldLatchBranch = cast<BranchInst>(OldLatch->getTerminator());
+        Builder.SetInsertPoint(OldLatchBranch);
+        Builder.CreateBr(SeparatorBasicBlock);
+        OldLatchBranch->eraseFromParent();
+        PreConditionBranch->setSuccessor(1, SeparatorBasicBlock);
+    }
 
     return Changed;
 }
 
+using namespace llvm::PatternMatch;
+bool GenerateBlockMemOpsPass::getOffset(Value *Init, SmallVector<Value *, 2> &Offset) {
+    Value *NonUnifOp = Init;
+    while (NonUnifOp) {
+        if (ZExtInst *ZExt = dyn_cast<ZExtInst>(NonUnifOp)) {
+            NonUnifOp = ZExt->getOperand(0);
+        } else if (SExtInst *SExt = dyn_cast<SExtInst>(NonUnifOp)) {
+            NonUnifOp = SExt->getOperand(0);
+        } else if (Instruction *Inst = dyn_cast<Instruction>(NonUnifOp)) {
+            if (Inst->getOpcode() != Instruction::Add)
+                return false;
+
+            IGC::IGCMD::FunctionInfoMetaDataHandle FuncInfoMD = MdUtils->getFunctionsInfoItem(Inst->getFunction());
+            IGC::IGCMD::ThreadGroupSizeMetaDataHandle ThreadGroupSize = FuncInfoMD->getThreadGroupSize();
+
+            // ThreadGroupSize must be specified; this is checked earlier in checkVectorizationAlongX.
+            IGC_ASSERT(ThreadGroupSize->hasValue());
+            int LogBase2 = std::log2((int32_t)ThreadGroupSize->getXDim());
+
+            // Check for the global_id_x pattern.
+            Value *LocalIdX = nullptr;
+            Value *R0 = nullptr;
+            auto GlobalIdXPattern = m_Add(m_Shl(m_ExtractElt(m_Value(R0), m_SpecificInt(1)), m_SpecificInt(LogBase2)), m_Value(LocalIdX));
+            if (match(NonUnifOp, GlobalIdXPattern)) {
+                if (ZExtInst *ZExt = dyn_cast<ZExtInst>(LocalIdX))
+                    LocalIdX = ZExt->getOperand(0);
+
+                if (isLocalIdX(LocalIdX) && isR0(R0))
+                    return true;
+            }
+
+            Value *Op0 = Inst->getOperand(0);
+            Value *Op1 = Inst->getOperand(1);
+            if (!WI->isUniform(Op1) && !WI->isUniform(Op0))
+                return false;
+
+            if (Offset.size() == 2)
+                return false;
+
+            if (WI->isUniform(Op0)) {
+                Offset.push_back(Op0);
+                NonUnifOp = Op1;
+            } else {
+                Offset.push_back(Op1);
+                NonUnifOp = Op0;
+            }
+        } else {
+            return false;
+        }
+    }
+
+    return false;
+}
+
+bool GenerateBlockMemOpsPass::isLoopPattern(Loop *L) {
+    if (!L)
+        return false;
+
+    BasicBlock *Header = L->getHeader();
+    BasicBlock *Latch = L->getLoopLatch();
+    BasicBlock *Preheader = L->getLoopPreheader();
+    PHINode *Phi = L->getInductionVariable(*SE);
+
+    // Check that the loop has a good shape, so it is safe to use the LLVM helpers on it.
+    if (!L->isSafeToClone() || (L->getNumBlocks() != 1) || !L->isLCSSAForm(*DT))
+        return false;
+
+    // Check that all parts of the loop can be found.
+    if (!Phi || !Preheader || !Latch || !Header)
+        return false;
+
+    ICmpInst *LatchCmp = dyn_cast<ICmpInst>(cast<BranchInst>(Latch->getTerminator())->getCondition());
+    if (!LatchCmp)
+        return false;
+
+    if (pred_size(Header) != 2)
+        return false;
+
+    // Check that the loop has only one exit block.
+    SmallVector<BasicBlock *, 4> ExitBlocks;
+    L->getExitBlocks(ExitBlocks);
+    if (ExitBlocks.size() != 1)
+        return false;
+
+    BasicBlock *Exit = ExitBlocks[0];
+
+    // Check that all values defined inside the loop have only internal users.
+    if (doesLoopHaveExternUse(L))
+        return false;
+
+    // Check that the loop's phi instructions have the expected form.
+    if (!checkLoopPhiVals(L))
+        return false;
+
+    // Check that the induction variable is incremented by the simd size.
+    Instruction *Inc = dyn_cast<Instruction>(Phi->getIncomingValueForBlock(Latch));
+    if (!Inc || Inc->getOpcode() != Instruction::Add || (Inc->getOperand(0) != Phi && Inc->getOperand(1) != Phi))
+        return false;
+
+    ConstantInt *CI = dyn_cast<ConstantInt>(Inc->getOperand(0));
+    if (!CI)
+        CI = dyn_cast<ConstantInt>(Inc->getOperand(1));
+    if (!CI)
+        return false;
+    if (CI->getValue() != SimdSize)
+        return false;
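+
+    // Increment shape this matches, as in the lit test below (SimdSize == 32):
+    //   %incr = add nsw i64 %ind, 32
+    //   %cond = icmp slt i64 %incr, %limit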
+
+    // Check that the loop condition is ULT or SLT.
+    CmpInst::Predicate Pred = LatchCmp->getPredicate();
+    if (Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_SLT)
+        return false;
+
+    // The loop limit must be uniform.
+    Value *Limit = LatchCmp->getOperand(1);
+    if (!WI->isUniform(Limit))
+        return false;
+
+    // The initial value of the induction variable must be continuous across the subgroup.
+    Value *InitValForIndVar = Phi->getIncomingValueForBlock(Preheader);
+    if (!isIndexContinuous(InitValForIndVar))
+        return false;
+
+    // Find the conditional branch that decides whether the loop is executed at all.
+    // It can be placed in the preheader or in the preheader's single predecessor.
+    // Its condition must match the condition in the loop latch.
+    BranchInst *PreConditionBranch = cast<BranchInst>(Preheader->getTerminator());
+    if (!PreConditionBranch->isConditional()) {
+        if (Preheader->size() != 1)
+            return false;
+
+        PreConditionBranch = nullptr;
+
+        if (Preheader->hasNPredecessors(1))
+            PreConditionBranch = cast<BranchInst>((*pred_begin(Preheader))->getTerminator());
+    }
+
+    if (!PreConditionBranch || !PreConditionBranch->isConditional())
+        return false;
+
+    ICmpInst *PreCondition = dyn_cast<ICmpInst>(PreConditionBranch->getCondition());
+    if (!PreCondition || PreCondition->getPredicate() != Pred || PreCondition->getOperand(1) != Limit)
+        return false;
+
+    if ((PreConditionBranch->getSuccessor(0) != Latch) && (PreConditionBranch->getSuccessor(0) != Preheader))
+        return false;
+
+    // The false edge of PreConditionBranch must lead to the loop exit, or the exit block must be
+    // empty and fall through to the same target.
+    if (PreConditionBranch->getSuccessor(1) != Exit) {
+        if (Exit->size() != 1)
+            return false;
+
+        BranchInst *ExitBranch = cast<BranchInst>(Exit->getTerminator());
+        if (ExitBranch->isConditional())
+            return false;
+
+        if (ExitBranch->getSuccessor(0) != PreConditionBranch->getSuccessor(1))
+            return false;
+    }
+
+    return true;
+}
+
+// Check that the incoming values of all phi instructions except the induction variable
+// are getelementptr instructions.
+bool GenerateBlockMemOpsPass::checkLoopPhiVals(Loop *L) {
+    BasicBlock *Preheader = L->getLoopPreheader();
+    BasicBlock *Latch = L->getLoopLatch();
+    PHINode *IndPhi = L->getInductionVariable(*SE);
+
+    for (auto &I : *Latch) {
+        PHINode *Phi = dyn_cast<PHINode>(&I);
+        if (!Phi)
+            break;
+
+        Value *IncomingVal = Phi->getIncomingValueForBlock(Preheader);
+        Value *InternalVal = Phi->getIncomingValueForBlock(Latch);
+
+        if (Phi != IndPhi) {
+            if (!isa<GetElementPtrInst>(IncomingVal))
+                return false;
+
+            if (!isa<GetElementPtrInst>(InternalVal))
+                return false;
+        }
+    }
+
+    return true;
+}
+
+// Check that values defined in the loop have no users outside of it.
+bool GenerateBlockMemOpsPass::doesLoopHaveExternUse(Loop *L) {
+    // The loop is expected to consist of a single block; this is checked earlier in isLoopPattern.
+    IGC_ASSERT(L->getNumBlocks() == 1);
+
+    BasicBlock *Latch = L->getLoopLatch();
+    for (auto &I : *Latch) {
+        for (User *U : I.users()) {
+            Instruction *Inst = dyn_cast<Instruction>(U);
+            if (!Inst)
+                return true;
+
+            if (Inst->getParent() != Latch)
+                return true;
+        }
+    }
+
+    return false;
+}
+
 bool GenerateBlockMemOpsPass::isAddressAligned(Value *Ptr, const alignment_t &CurrentAlignment, Type *DataType) {
     unsigned ScalarSize = DataType->getScalarSizeInBits();
 
@@ -84,38 +525,70 @@ bool GenerateBlockMemOpsPass::isAddressAligned(Value *Ptr, const alignment_t &CurrentAlignment, Type *DataType)
 // This function checks if Indx is equal to 1 * LocalIdX + UniformPart, assuming LocalIdY and LocalIdZ are uniform values.
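+// For instance, Indx = zext(local_id_x) + uniform_base is continuous (lane i accesses element
+// uniform_base + i), while Indx = 2 * local_id_x is not: adjacent lanes would touch
+// non-adjacent elements, which block reads / writes cannot handle.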
 bool GenerateBlockMemOpsPass::isIndexContinuous(Value *Indx) {
-    Instruction *NonUnifInst = dyn_cast<Instruction>(Indx);
-    if (!NonUnifInst)
-        return false;
+    // Worklist of non-uniform values that still have to be proven continuous.
+    SmallVector<Value *, 8> NonUniformInstVector;
+    NonUniformInstVector.push_back(Indx);
+    PHINode *VisitedPhi = nullptr;
 
-    Value *NonUniformOp = nullptr;
-    // Continuity requires that only add and zext operations can be performed on a non-uniform value.
-    while (NonUnifInst) {
-        if (isa<ZExtInst>(NonUnifInst)) {
-            NonUniformOp = NonUnifInst->getOperand(0);
-        } else if (NonUnifInst->getOpcode() == Instruction::Add) {
-            Value *Op0 = NonUnifInst->getOperand(0);
-            Value *Op1 = NonUnifInst->getOperand(1);
-            if (!WI->isUniform(Op1) && !WI->isUniform(Op0))
-                return false;
-            if (WI->isUniform(Op0)) {
-                NonUniformOp = Op1;
-            } else {
-                NonUniformOp = Op0;
-            }
-        } else {
-            return false;
-        }
-
-        // If local_id_x was met then index is continuous.
-        if (isLocalIdX(NonUniformOp))
-            return true;
-
-        NonUnifInst = dyn_cast<Instruction>(NonUniformOp);
+    // Continuity requires that only add, zext / sext and a single loop-carried phi
+    // are applied to the non-uniform part of the index.
+    while (!NonUniformInstVector.empty()) {
+        Value *NonUnifOp = NonUniformInstVector.pop_back_val();
+
+        if (!NonUnifOp)
+            return false;
+
+        if (ZExtInst *ZExt = dyn_cast<ZExtInst>(NonUnifOp)) {
+            NonUniformInstVector.push_back(ZExt->getOperand(0));
+        } else if (SExtInst *SExt = dyn_cast<SExtInst>(NonUnifOp)) {
+            NonUniformInstVector.push_back(SExt->getOperand(0));
+        } else if (PHINode *Phi = dyn_cast<PHINode>(NonUnifOp)) {
+            // Check that the PHINode has two incoming values, one of which is calculated
+            // from local_id_x and the other from this PHINode itself.
+            if (VisitedPhi && VisitedPhi != Phi)
+                return false;
+
+            if (VisitedPhi)
+                continue;
+
+            if (Phi->getNumIncomingValues() != 2)
+                return false;
+
+            for (Use &U : Phi->incoming_values()) {
+                Value *V = U.get();
+                if (WI->isUniform(V))
+                    return false;
+
+                NonUniformInstVector.push_back(V);
+            }
+            VisitedPhi = Phi;
+        } else if (Instruction *Inst = dyn_cast<Instruction>(NonUnifOp)) {
+            if (Inst->getOpcode() != Instruction::Add)
+                return false;
+
+            Value *Op0 = Inst->getOperand(0);
+            Value *Op1 = Inst->getOperand(1);
+
+            if (!WI->isUniform(Op1) && !WI->isUniform(Op0))
+                return false;
+
+            // Keep walking through the non-uniform operand.
+            if (WI->isUniform(Op0)) {
+                NonUniformInstVector.push_back(Op1);
+            } else {
+                NonUniformInstVector.push_back(Op0);
+            }
+        } else if (!isLocalIdX(NonUnifOp)) {
+            // Only local_id_x is an acceptable non-uniform leaf; anything else breaks continuity.
+            return false;
+        }
     }
 
-    return false;
+    return true;
 }
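+
+// E.g. the loop-carried index from the lit test below is accepted:
+//   %ind = phi i64 [ %sum64, %preheader ], [ %incr, %latch ]
+// where %sum64 derives from local_id_x and %incr = add nsw i64 %ind, 32 steps it
+// by the (uniform) subgroup size.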
 
@@ -123,40 +596,33 @@
 bool GenerateBlockMemOpsPass::checkVectorizationAlongX(Function *F) {
     if (CGCtx->type != ShaderType::OPENCL_SHADER)
         return false;
 
-    IGC::IGCMD::FunctionInfoMetaDataHandle funcInfoMD = MdUtils->getFunctionsInfoItem(F);
-    ModuleMetaData *modMD = CGCtx->getModuleMetaData();
-    auto funcMD = modMD->FuncMD.find(F);
+    IGC::IGCMD::FunctionInfoMetaDataHandle FuncInfoMD = MdUtils->getFunctionsInfoItem(F);
+    ModuleMetaData *ModMD = CGCtx->getModuleMetaData();
+    auto FuncMD = ModMD->FuncMD.find(F);
 
-    if (funcMD == modMD->FuncMD.end())
+    if (FuncMD == ModMD->FuncMD.end())
         return false;
 
-    WorkGroupWalkOrderMD workGroupWalkOrder = funcMD->second.workGroupWalkOrder;
-    if (workGroupWalkOrder.dim0 != 0 || workGroupWalkOrder.dim1 != 1 || workGroupWalkOrder.dim2 != 2)
+    WorkGroupWalkOrderMD WorkGroupWalkOrder = FuncMD->second.workGroupWalkOrder;
+    if (WorkGroupWalkOrder.dim0 != 0 || WorkGroupWalkOrder.dim1 != 1 || WorkGroupWalkOrder.dim2 != 2)
         return false;
 
     int32_t X = -1;
-    IGC::IGCMD::ThreadGroupSizeMetaDataHandle threadGroupSize = funcInfoMD->getThreadGroupSize();
-    if (!threadGroupSize->hasValue())
+    IGC::IGCMD::ThreadGroupSizeMetaDataHandle ThreadGroupSize = FuncInfoMD->getThreadGroupSize();
+    if (!ThreadGroupSize->hasValue())
         return false;
 
-    X = (int32_t)threadGroupSize->getXDim();
+    X = (int32_t)ThreadGroupSize->getXDim();
     if (!X)
         return false;
 
-    if (X % MaxSgSize == 0)
+    if (X % SimdSize == 0)
         return true;
 
     return false;
 }
 
 bool GenerateBlockMemOpsPass::canOptLoadStore(Instruction *I) {
-    if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
-        return false;
-
-    // Block read and write instructions must be called by all elements in the subgroup.
-    if (WI->insideDivergentCF(I))
-        return false;
-
     Value *Ptr = nullptr;
     Value *ValOp = nullptr;
     Type *DataType = nullptr;
@@ -181,9 +647,9 @@ bool GenerateBlockMemOpsPass::canOptLoadStore(Instruction *I) {
     if (!isAddressAligned(Ptr, CurrentAlignment, DataType))
         return false;
 
-    GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
     // Get the last index from the getelementptr instruction if it is not uniform in the subgroup.
-    Value *Idx = checkGep(Gep);
+    Instruction *PtrInstr = dyn_cast<Instruction>(Ptr);
+    Value *Idx = checkGep(PtrInstr);
 
     if (!Idx)
         return false;
 
@@ -202,9 +668,21 @@ bool GenerateBlockMemOpsPass::isLocalIdX(const Value *InputVal) {
     Function *F = const_cast<Function *>(A->getParent());
     ImplicitArgs implicitArgs(*F, MdUtils);
     Value *localIdX = implicitArgs.getImplicitArgValue(*F, ImplicitArg::LOCAL_ID_X, MdUtils);
+
     return A == localIdX;
 }
 
+bool GenerateBlockMemOpsPass::isR0(const Value *InputVal) {
+    const Argument *A = dyn_cast<Argument>(InputVal);
+    if (!A)
+        return false;
+    Function *F = const_cast<Function *>(A->getParent());
+    ImplicitArgs implicitArgs(*F, MdUtils);
+    Value *R0 = implicitArgs.getImplicitArgValue(*F, ImplicitArg::R0, MdUtils);
+
+    return A == R0;
+}
+
 bool GenerateBlockMemOpsPass::changeToBlockInst(Instruction *I) {
     IRBuilder<> Builder(I);
     Function *BlockOpDecl = nullptr;
@@ -235,7 +713,31 @@ bool GenerateBlockMemOpsPass::changeToBlockInst(Instruction *I) {
     return true;
 }
 
-Value *GenerateBlockMemOpsPass::checkGep(GetElementPtrInst *Gep) {
+Value *GenerateBlockMemOpsPass::checkGep(Instruction *PtrInstr) {
+    if (!PtrInstr)
+        return nullptr;
+
+    PHINode *Phi = dyn_cast<PHINode>(PtrInstr);
+    GetElementPtrInst *Gep = nullptr;
+    if (Phi) {
+        if (Phi->getNumIncomingValues() != 2)
+            return nullptr;
+
+        BasicBlock *BB = PtrInstr->getParent();
+        Loop *L = LI->getLoopFor(BB);
+        if (!L)
+            return nullptr;
+
+        BasicBlock *Preheader = L->getLoopPreheader();
+        Value *IncomingVal = Phi->getIncomingValueForBlock(Preheader);
+
+        Gep = dyn_cast<GetElementPtrInst>(IncomingVal);
+        if (!Gep)
+            return nullptr;
+    } else {
+        Gep = dyn_cast<GetElementPtrInst>(PtrInstr);
+    }
+
     if (!Gep)
         return nullptr;
 
@@ -259,10 +761,41 @@ Value *GenerateBlockMemOpsPass::checkGep(GetElementPtrInst *Gep) {
     if (!IsLastIndUniform && IsPtrUniform) {
         return *LIndx;
     } else if (IsLastIndUniform && !IsPtrUniform) {
-        if (!isa<GetElementPtrInst>(Ptr))
+        if (!isa<GetElementPtrInst>(Ptr) && !isa<PHINode>(Ptr))
             return nullptr;
 
-        return checkGep(cast<GetElementPtrInst>(Ptr));
+        if (PHINode *Phi = dyn_cast<PHINode>(Ptr)) {
+            if (Phi->getNumIncomingValues() != 2)
+                return nullptr;
+
+            // One incoming value must be a gep that increments the phi itself by a uniform step;
+            // the other one provides the incoming address.
+            bool IsGepHasPhiArg = false;
+            for (Use &U : Phi->incoming_values()) {
+                Value *V = U.get();
+
+                if (!isa<GetElementPtrInst>(V))
+                    return nullptr;
+
+                GetElementPtrInst *G = cast<GetElementPtrInst>(V);
+
+                if (G->getOperand(0) == Phi) {
+                    // The address is incremented with a gep; all of its indices must be uniform.
+                    IsGepHasPhiArg = true;
+                    for (auto Idx = G->idx_begin(), E = G->idx_end(); Idx != E; Idx++) {
+                        if (!WI->isUniform(*Idx)) {
+                            return nullptr;
+                        }
+                    }
+                } else {
+                    // Get the incoming address value.
+                    Ptr = V;
+                }
+            }
+
+            if (!IsGepHasPhiArg)
+                return nullptr;
+        }
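+
+        // Illustrative recurrence this accepts (hypothetical IR; the step is one subgroup):
+        //   %p = phi float addrspace(1)* [ %p.init, %preheader ], [ %p.next, %latch ]
+        //   %p.next = getelementptr float, float addrspace(1)* %p, i64 32
+        // The check then recurses into %p.init.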
+        return checkGep(dyn_cast<Instruction>(Ptr));
     }
 
     return nullptr;
diff --git a/IGC/Compiler/CISACodeGen/GenerateBlockMemOpsPass.hpp b/IGC/Compiler/CISACodeGen/GenerateBlockMemOpsPass.hpp
index 4748647da5c3..7decfd9bbbc3 100644
--- a/IGC/Compiler/CISACodeGen/GenerateBlockMemOpsPass.hpp
+++ b/IGC/Compiler/CISACodeGen/GenerateBlockMemOpsPass.hpp
@@ -10,6 +10,8 @@ SPDX-License-Identifier: MIT
 
 #include "common/LLVMWarningsPush.hpp"
 #include <llvm/Pass.h>
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
 #include "common/LLVMWarningsPop.hpp"
 #include "Compiler/CISACodeGen/WIAnalysis.hpp"
 #include "GenISAIntrinsics/GenIntrinsicInst.h"
@@ -29,24 +31,35 @@ class GenerateBlockMemOpsPass : public llvm::FunctionPass
     }
 
     virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
-        AU.setPreservesCFG();
         AU.addRequired<WIAnalysis>();
         AU.addRequired<MetaDataUtilsWrapper>();
+        AU.addRequired<llvm::LoopInfoWrapperPass>();
+        AU.addRequired<llvm::DominatorTreeWrapperPass>();
+        AU.addRequired<llvm::ScalarEvolutionWrapperPass>();
         AU.addRequired<CodeGenContextWrapper>();
     }
 
     virtual bool runOnFunction(llvm::Function &F) override;
 
 private:
-    llvm::Value *checkGep(llvm::GetElementPtrInst *Gep);
+    llvm::Value *checkGep(llvm::Instruction *PtrInstr);
     bool isLocalIdX(const llvm::Value *InputVal);
+    bool isR0(const llvm::Value *InputVal);
     bool isAddressAligned(llvm::Value *Ptr, const alignment_t &CurrentAlignment, llvm::Type *DataType);
     bool isIndexContinuous(llvm::Value *Addr);
     bool checkVectorizationAlongX(llvm::Function *F);
+    bool checkLoopPhiVals(llvm::Loop *L);
     bool changeToBlockInst(llvm::Instruction *I);
+    bool doesLoopHaveExternUse(llvm::Loop *L);
+    bool getOffset(llvm::Value *Init, llvm::SmallVector<llvm::Value *, 2> &Offset);
     bool canOptLoadStore(llvm::Instruction *I);
+    bool isLoopPattern(llvm::Loop *L);
 
     WIAnalysis *WI = nullptr;
     IGC::CodeGenContext *CGCtx = nullptr;
     IGC::IGCMD::MetaDataUtils *MdUtils = nullptr;
+    llvm::DominatorTree *DT = nullptr;
+    llvm::LoopInfo *LI = nullptr;
+    llvm::ScalarEvolution *SE = nullptr;
+    size_t SimdSize = 0;
 };
 }
\ No newline at end of file
diff --git a/IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp b/IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp
index 7b17b7e7b69c..f1a8a172b4c9 100644
--- a/IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp
+++ b/IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp
@@ -1408,12 +1408,6 @@ void OptimizeIR(CodeGenContext* const pContext)
             mpm.add(new IGCConstProp());
             GFX_ONLY_PASS { mpm.add(createTranslateToProgrammableOffsetsPass()); }
 
-            // This pass needs to be extended for other devices
-            if (pContext->platform.getPlatformInfo().eProductFamily == IGFX_PVC)
-            {
-                mpm.add(new GenerateBlockMemOpsPass());
-            }
-            mpm.add(new BlockMemOpAddrScalarizationPass());
             mpm.add(new CustomSafeOptPass());
             if (!pContext->m_DriverInfo.WADisableCustomPass())
@@ -1477,7 +1471,20 @@ void OptimizeIR(CodeGenContext* const pContext)
                 mpm.add(llvm::createLoopRotatePass(LOOP_ROTATION_HEADER_INST_THRESHOLD));
                 mpm.add(llvm::createLCSSAPass());
                 mpm.add(llvm::createLoopSimplifyPass());
+            }
+        }
 
+        // This pass needs to be extended for other devices.
+        if (pContext->platform.getPlatformInfo().eProductFamily == IGFX_PVC)
+        {
+            mpm.add(new GenerateBlockMemOpsPass());
+        }
+        mpm.add(new BlockMemOpAddrScalarizationPass());
+
+        if (pContext->m_instrTypes.hasMultipleBB && !disableGOPT)
+        {
+            if (pContext->m_instrTypes.numOfLoop)
+            {
                 bool allowLICM = IGC_IS_FLAG_ENABLED(allowLICM) && pContext->m_retryManager.AllowLICM();
                 bool runGEPLSR = IGC_IS_FLAG_ENABLED(EnableGEPLSR) &&
                     pContext->type == ShaderType::OPENCL_SHADER &&
diff --git a/IGC/Compiler/tests/GenerateBlockMemOpsPass/block_read_write_check.ll b/IGC/Compiler/tests/GenerateBlockMemOpsPass/block_read_write_check.ll
index 6795d9a9ca30..9a275a06a5a6 100644
--- a/IGC/Compiler/tests/GenerateBlockMemOpsPass/block_read_write_check.ll
+++ b/IGC/Compiler/tests/GenerateBlockMemOpsPass/block_read_write_check.ll
@@ -60,7 +60,7 @@ entry:
   %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %out, i64 %conv.i
   store float %2, float addrspace(1)* %arrayidx1, align 4
 
-  ; CHECK-NOT: %{{.*}} = simdBlockWrite
+  ; CHECK-NOT: simdBlockWrite
 
   ret void
@@ -68,14 +68,47 @@ entry:
 }
 
-!igc.functions = !{!1, !2}
+define spir_kernel void @testYZUnifLoop(float addrspace(1)* %out, float addrspace(1)* %in, <8 x i32> %r0, <8 x i32> %payloadHeader, <3 x i32> %localSize, i16 %localIdX, i16 %localIdY, i16 %localIdZ, i32 %bufferOffset, i64 %limit) {
+; CHECK: %{{.*}} = load
+; CHECK: store
+; CHECK: [[TMP0:%.*]] = call float @llvm.genx.GenISA.simdBlockRead.f32.p1f32(float addrspace(1)* %{{.*}})
+; CHECK: call void @llvm.genx.GenISA.simdBlockWrite.p1f32.f32(float addrspace(1)* %{{.*}}, float [[TMP0]])
+entry:
+  %offset = extractelement <8 x i32> %payloadHeader, i64 0
+  %groupNumX = extractelement <8 x i32> %r0, i64 1
+  %shl = shl i32 %groupNumX, 5
+  %localIdX31 = zext i16 %localIdX to i32
+  %globalIdX = add i32 %shl, %localIdX31
+  %sum = add i32 %globalIdX, %offset
+  %sum64 = zext i32 %sum to i64
+  %precond = icmp slt i64 %sum64, %limit
+  br i1 %precond, label %preheader, label %terminator
+
+preheader:
+  br label %latch
+
+latch:
+  %ind = phi i64 [ %sum64, %preheader ], [ %incr, %latch ]
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %ind
+  %load = load float, float addrspace(1)* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %out, i64 %ind
+  store float %load, float addrspace(1)* %arrayidx1, align 4
+  %incr = add nsw i64 %ind, 32
+  %cond = icmp slt i64 %incr, %limit
+  br i1 %cond, label %latch, label %exit
+
+exit:
+  br label %terminator
+
+terminator:
+  ret void
+}
+
+!igc.functions = !{!1, !2, !3}
 !IGCMetadata = !{!19}
 
 !1 = !{void (float addrspace(1)*, float addrspace(1)*, <8 x i32>, <8 x i32>, <3 x i32>, i16, i16, i16, i32, i32)* @testYZUnif, !41}
 !2 = !{void (float addrspace(1)*, float addrspace(1)*, <8 x i32>, <8 x i32>, <3 x i32>, i16, i16, i16, i32, i32)* @testNoUnif, !42}
+!3 = !{void (float addrspace(1)*, float addrspace(1)*, <8 x i32>, <8 x i32>, <3 x i32>, i16, i16, i16, i32, i64)* @testYZUnifLoop, !43}
 !41 = !{!5, !6, !17}
 !42 = !{!5, !6}
-!43 = !{!5, !6, !18}
+!43 = !{!5, !6, !17}
 !5 = !{!"function_type", i32 0}
 !6 = !{!"implicit_arg_desc", !7, !8, !9, !10, !11, !12, !13, !15}
 !7 = !{i32 0}
@@ -99,11 +132,13 @@ entry:
 !18 = !{!"thread_group_size", i32 16, i32 32, i32 32}
 !19 = !{!"ModuleMD", !112}
 
-!112 = !{!"FuncMD", !113, !114, !333, !334}
+!112 = !{!"FuncMD", !113, !114, !333, !334, !335, !336}
 !113 = !{!"FuncMDMap[0]", void (float addrspace(1)*, float addrspace(1)*, <8 x i32>, <8 x i32>, <3 x i32>, i16, i16, i16, i32, i32)* @testYZUnif}
 !114 = !{!"FuncMDValue[0]", !116}
 !333 = !{!"FuncMDMap[1]", void (float addrspace(1)*, float addrspace(1)*, <8 x i32>, <8 x i32>, <3 x i32>, i16, i16, i16, i32, i32)* @testNoUnif}
 !334 = !{!"FuncMDValue[1]", !116}
+!335 = !{!"FuncMDMap[2]", void (float addrspace(1)*, float addrspace(1)*, <8 x i32>, <8 x i32>, <3 x i32>, i16, i16, i16, i32, i64)* @testYZUnifLoop}
+!336 = !{!"FuncMDValue[2]", !116}
 !116 = !{!"workGroupWalkOrder", !117, !118, !119}
 !117 = !{!"dim0", i32 0}
 !118 = !{!"dim1", i32 1}