diff --git a/IGC/Compiler/Optimizer/Scalarizer.cpp b/IGC/Compiler/Optimizer/Scalarizer.cpp
index 36486b4d8ea7..33e4a488a3e5 100644
--- a/IGC/Compiler/Optimizer/Scalarizer.cpp
+++ b/IGC/Compiler/Optimizer/Scalarizer.cpp
@@ -23,8 +23,8 @@ SPDX-License-Identifier: MIT
 #include "common/LLVMWarningsPop.hpp"
 #include "common/igc_regkeys.hpp"
 #include "common/Types.hpp"
-#include <set>
 #include "Probe/Assertion.h"
+#include <set>
 
 using namespace llvm;
 using namespace IGC;
@@ -62,6 +62,8 @@ ScalarizeFunction::ScalarizeFunction(bool selectiveScalarization) : FunctionPass
     initializeScalarizeFunctionPass(*PassRegistry::getPassRegistry());
     for (int i = 0; i < Instruction::OtherOpsEnd; i++)
         m_transposeCtr[i] = 0;
+
+    // Needs IGC_EnableSelectiveScalarizer = 1
     m_SelectiveScalarization = selectiveScalarization;
 
     // Initialize SCM buffers and allocation
@@ -70,14 +72,17 @@ ScalarizeFunction::ScalarizeFunction(bool selectiveScalarization) : FunctionPass
     m_SCMArrayLocation = 0;
 
     V_PRINT(scalarizer, "ScalarizeFunction constructor\n");
+    V_PRINT(scalarizer, "IGC_EnableSelectiveScalarizer = ");
+    V_PRINT(scalarizer, IGC_IS_FLAG_ENABLED(EnableSelectiveScalarizer));
+    V_PRINT(scalarizer, "\n");
 }
 
-ScalarizeFunction::~ScalarizeFunction()
-{
+bool ScalarizeFunction::doFinalization(llvm::Module& M) {
     releaseAllSCMEntries();
     delete[] m_SCMAllocationArray;
     destroyDummyFunc();
-    V_PRINT(scalarizer, "ScalarizeFunction destructor\n");
+    V_PRINT(scalarizer, "ScalarizeFunction doFinalization\n");
+    return true;
 }
 
 bool ScalarizeFunction::runOnFunction(Function& F)
@@ -157,7 +162,7 @@ bool ScalarizeFunction::runOnFunction(Function& F)
     for (; index != re; ++index)
     {
         // get rid of old users
-        if (Value * val = dyn_cast<Value>(*index))
+        if (Value* val = dyn_cast<Value>(*index))
        {
            UndefValue* undefVal = UndefValue::get((*index)->getType());
            (val)->replaceAllUsesWith(undefVal);
@@ -171,13 +176,18 @@ bool ScalarizeFunction::runOnFunction(Function& F)
 }
 
 ///
-/// @brief We want to avoid scalarize vector-phi node if the vector is used
+/// @brief We want to avoid scalarization of vector instructions if the vector is used
 /// as a whole entity somewhere in the program. This function tries to find
 /// this kind of definition web that involves phi-node, insert-element etc,
 /// then add them into the exclusion-set (excluded from scalarization).
 ///
 void ScalarizeFunction::buildExclusiveSet()
 {
+
+    auto isAddToWeb = [](Value* V) -> bool {
+        return isa<BitCastInst>(V) || isa<PHINode>(V);
+    };
+
     auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
     for (auto dfi = df_begin(DT->getRootNode()),
         dfe = df_end(DT->getRootNode()); dfi != dfe; ++dfi)
@@ -190,7 +200,10 @@ void ScalarizeFunction::buildExclusiveSet()
             Instruction* currInst = &*sI;
             ++sI;
             // find the seed for the workset
-            std::vector<llvm::Value*> workset;
+            std::vector<Value*> workset;
+
+            // Instructions that accept vectorial arguments can end legs of the web,
+            // i.e. the instructions that produce the vectorial arguments may be protected from scalarization
             if (GenIntrinsicInst* GII = dyn_cast<GenIntrinsicInst>(currInst))
             {
                 unsigned numOperands = IGCLLVM::getNumArgOperands(GII);
@@ -203,6 +216,16 @@ void ScalarizeFunction::buildExclusiveSet()
                     }
                 }
             }
+            else if (CallInst * CI = dyn_cast<CallInst>(currInst))
+            {
+                for (auto arg = CI->arg_begin(); arg != CI->arg_end(); ++arg)
+                {
+                    if (isa<VectorType>(arg->get()->getType()))
+                    {
+                        workset.push_back(arg->get());
+                    }
+                }
+            }
             else if (auto IEI = dyn_cast<InsertElementInst>(currInst))
             {
                 Value* scalarIndexVal = IEI->getOperand(2);
@@ -219,9 +242,12 @@ void ScalarizeFunction::buildExclusiveSet()
                     workset.push_back(EEI->getOperand(0));
                 }
             }
-            // try to find a phi-web from the seed
-            bool HasPHI = false;
-            std::set<Value*> defweb;
+            else if (BitCastInst* BCI = dyn_cast<BitCastInst>(currInst))
+            {
+                workset.push_back(BCI->getOperand(0));
+            }
+            // try to find a web from the seed
+            std::set<Value*> defweb;
             while (!workset.empty())
             {
                 auto Def = workset.back();
@@ -230,70 +256,45 @@ void ScalarizeFunction::buildExclusiveSet()
                 {
                     continue;
                 }
-                if (auto IEI = dyn_cast<InsertElementInst>(Def))
-                {
-                    defweb.insert(IEI);
-                    if (!defweb.count(IEI->getOperand(0)) &&
-                        (isa<InsertElementInst>(IEI->getOperand(0)) ||
-                            isa<ShuffleVectorInst>(IEI->getOperand(0)) ||
-                            isa<PHINode>(IEI->getOperand(0))))
-                    {
-                        workset.push_back(IEI->getOperand(0));
-                    }
-                }
-                else if (auto SVI = dyn_cast<ShuffleVectorInst>(Def))
+
+                // The web grows "up" through BitCasts and PHI nodes
+                // but insert/extract elements and vector shuffles should be scalarized
+                if (!isAddToWeb(Def)) continue;
+
+                if (BitCastInst* BCI = dyn_cast<BitCastInst>(Def))
                 {
-                    defweb.insert(SVI);
-                    if (!defweb.count(SVI->getOperand(0)) &&
-                        (isa<InsertElementInst>(SVI->getOperand(0)) ||
-                            isa<ShuffleVectorInst>(SVI->getOperand(0)) ||
-                            isa<PHINode>(SVI->getOperand(0))))
-                    {
-                        workset.push_back(SVI->getOperand(0));
-                    }
-                    if (!defweb.count(SVI->getOperand(1)) &&
-                        (isa<InsertElementInst>(SVI->getOperand(1)) ||
-                            isa<ShuffleVectorInst>(SVI->getOperand(1)) ||
-                            isa<PHINode>(SVI->getOperand(1))))
+                    defweb.insert(BCI);
+                    if (!defweb.count(BCI->getOperand(0)) && isAddToWeb(BCI->getOperand(0)))
                     {
-                        workset.push_back(SVI->getOperand(1));
+                        workset.push_back(BCI->getOperand(0));
                     }
                 }
                 else if (auto PHI = dyn_cast<PHINode>(Def))
                 {
                     defweb.insert(PHI);
-                    HasPHI = true; // !this def-web is qualified!
                     for (int i = 0, n = PHI->getNumOperands(); i < n; ++i)
-                        if (!defweb.count(PHI->getOperand(i)) &&
-                            (isa<InsertElementInst>(PHI->getOperand(i)) ||
-                                isa<ShuffleVectorInst>(PHI->getOperand(i)) ||
-                                isa<PHINode>(PHI->getOperand(i))))
+                    {
+                        if (!defweb.count(PHI->getOperand(i)) && isAddToWeb(PHI->getOperand(i)))
                         {
                             workset.push_back(PHI->getOperand(i));
                         }
+                    }
                 }
                 else
                 {
                     continue;
                 }
-                // check use
+
+                // The web grows "down" through BitCasts and PHI nodes as well
                 for (auto U : Def->users())
                 {
-                    if (!defweb.count(U) &&
-                        (isa<InsertElementInst>(U) ||
-                            isa<ShuffleVectorInst>(U) ||
-                            isa<PHINode>(U)))
+                    if (!defweb.count(U) && isAddToWeb(U))
                     {
                         workset.push_back(U);
                     }
                 }
             }
-            // if we find a qualified web with PHINode, add those instructions
-            // into the exclusion set
-            if (HasPHI)
-            {
-                m_Excludes.merge(defweb);
-            }
+            m_Excludes.merge(defweb);
         }
     }
 }
@@ -390,7 +391,7 @@ void ScalarizeFunction::recoverNonScalarizableInst(Instruction* Inst)
     if (isa<VectorType>(Inst->getType())) getSCMEntry(Inst);
 
     // Iterate over all arguments. Check that they all exist (or rebuilt)
-    if (CallInst * CI = dyn_cast<CallInst>(Inst))
+    if (CallInst* CI = dyn_cast<CallInst>(Inst))
     {
         unsigned numOperands = IGCLLVM::getNumArgOperands(CI);
         for (unsigned i = 0; i < numOperands; i++)
@@ -508,7 +509,7 @@ void ScalarizeFunction::scalarizeInstruction(BinaryOperator* BI)
             BI->getName(),
             BI
         );
-        if (BinaryOperator * BO = dyn_cast<BinaryOperator>(Val)) {
+        if (BinaryOperator* BO = dyn_cast<BinaryOperator>(Val)) {
             // Copy overflow flags if any.
             if (isa<OverflowingBinaryOperator>(BO)) {
                 BO->setHasNoSignedWrap(BI->hasNoSignedWrap());
@@ -609,7 +610,7 @@ void ScalarizeFunction::scalarizeInstruction(CastInst* CI)
             "unexpected type!");
         IGC_ASSERT_MESSAGE(
             cast<IGCLLVM::FixedVectorType>(CI->getOperand(0)->getType())
-                ->getNumElements() == numElements,
+            ->getNumElements() == numElements,
             "unexpected vector width");
 
     // Obtain scalarized argument
@@ -666,7 +667,7 @@ void ScalarizeFunction::scalarizeInstruction(PHINode* PI)
     {
         auto* Op = PI->getIncomingValue(i);
 
-        if (auto * GII = dyn_cast<GenIntrinsicInst>(Op))
+        if (auto* GII = dyn_cast<GenIntrinsicInst>(Op))
         {
             switch (GII->getIntrinsicID())
             {
@@ -694,7 +695,7 @@ void ScalarizeFunction::scalarizeInstruction(PHINode* PI)
             phis.pop_back();
             for (auto U : PN->users())
             {
-                if (GenIntrinsicInst * GII = dyn_cast<GenIntrinsicInst>(U))
+                if (GenIntrinsicInst* GII = dyn_cast<GenIntrinsicInst>(U))
                 {
                     switch (GII->getIntrinsicID())
                     {
@@ -703,11 +704,16 @@ void ScalarizeFunction::scalarizeInstruction(PHINode* PI)
                     case GenISAIntrinsic::GenISA_sub_group_dpas:
                     case GenISAIntrinsic::GenISA_dpas:
                     case GenISAIntrinsic::GenISA_simdBlockWrite:
+                    case GenISAIntrinsic::GenISA_simdBlockWriteBindless:
+                    case GenISAIntrinsic::GenISA_simdMediaBlockWrite:
+                    case GenISAIntrinsic::GenISA_LSC2DBlockWrite:
+                    case GenISAIntrinsic::GenISA_LSC2DBlockWriteAddrPayload:
+                    case GenISAIntrinsic::GenISA_LSCStoreBlock:
                         recoverNonScalarizableInst(PI);
                         return;
                     }
                 }
-                else if (PHINode * N = dyn_cast<PHINode>(U))
+                else if (PHINode* N = dyn_cast<PHINode>(U))
                 {
                     if (visited.count(N) == 0) {
                         visited[N] = 1;
@@ -720,7 +726,6 @@ void ScalarizeFunction::scalarizeInstruction(PHINode* PI)
         }
         phis.clear();
     }
 
-
     // Prepare empty SCM entry for the instruction
     SCMEntry* newEntry = getSCMEntry(PI);
@@ -1047,7 +1052,7 @@ void ScalarizeFunction::scalarizeInstruction(GetElementPtrInst* GI)
             auto op1 = baseValue->getType()->isVectorTy() ? operand1[i] : baseValue;
             auto op2 = indexValue->getType()->isVectorTy() ? operand2[i] : indexValue;
 
-            Type *BaseTy = IGCLLVM::getNonOpaquePtrEltTy(op1->getType());
+            Type* BaseTy = IGCLLVM::getNonOpaquePtrEltTy(op1->getType());
             Value* newGEP = GetElementPtrInst::Create(BaseTy, op1, op2,
                 VALUE_NAME(GI->getName()), GI);
             Value* constIndex = ConstantInt::get(Type::getInt32Ty(context()), i);
@@ -1123,7 +1128,7 @@ void ScalarizeFunction::obtainScalarizedValues(SmallVectorImpl<Value*>& retValue
                 retValues[i + destIdx] = undefElement;
             }
         }
-        else if (Constant * vectorConst = dyn_cast<Constant>(origValue))
+        else if (Constant* vectorConst = dyn_cast<Constant>(origValue))
        {
            V_PRINT(scalarizer, "\t\t\tProper constant: " << *vectorConst << "\n");
            // Value is a constant. Break it down to scalars by employing a constant expression
@@ -1310,7 +1315,7 @@ void ScalarizeFunction::updateSCMEntryWithValues(ScalarizeFunction::SCMEntry* en
 
     if (matchDbgLoc)
     {
-        if (const Instruction * origInst = dyn_cast<Instruction>(origValue))
+        if (const Instruction* origInst = dyn_cast<Instruction>(origValue))
         {
             for (unsigned i = 0; i < width; ++i)
             {
@@ -1347,17 +1352,17 @@ void ScalarizeFunction::resolveDeferredInstructions()
 
     // lambda to check if a value is a dummy instruction
     auto isDummyValue = [this](Value* val)
-    {
-        auto* call = dyn_cast<CallInst>(val);
-        if (!call) return false;
-        // If the Value is one of the dummy functions that we created.
+        {
+            auto* call = dyn_cast<CallInst>(val);
+            if (!call) return false;
+            // If the Value is one of the dummy functions that we created.
-        for (const auto& function : createdDummyFunctions) {
-            if (call->getCalledFunction() == function.second)
-                return true;
-        }
+            for (const auto& function : createdDummyFunctions) {
+                if (call->getCalledFunction() == function.second)
+                    return true;
+            }
 
-        return false;
-    };
+            return false;
+        };
 
     for (auto deferredEntry = m_DRL.begin(); m_DRL.size() > 0;)
     {
@@ -1395,8 +1400,8 @@ void ScalarizeFunction::resolveDeferredInstructions()
                 newInsts.resize(width);
                 for (unsigned i = 0; i < width; i++)
                 {
-                    Value *constIndex = ConstantInt::get(Type::getInt32Ty(context()), i);
-                    Instruction *EE = ExtractElementInst::Create(vectorInst, constIndex,
+                    Value* constIndex = ConstantInt::get(Type::getInt32Ty(context()), i);
+                    Instruction* EE = ExtractElementInst::Create(vectorInst, constIndex,
                         VALUE_NAME(vectorInst->getName() + ".scalar"), &(*insertLocation));
                     newInsts[i] = EE;
                 }
@@ -1417,7 +1422,7 @@ void ScalarizeFunction::resolveDeferredInstructions()
                     // It's possible the scalar values are not resolved earlier and are themselves dummy instructions.
                    // In order to find the real value, we look in the map to see which value replaced it.
                    if (dummyToScalarMap.count(scalarVal))
-                         scalarVal = dummyToScalarMap[scalarVal];
+                        scalarVal = dummyToScalarMap[scalarVal];
                     else
                         totallyResolved = false;
                 }
@@ -1441,10 +1446,10 @@ void ScalarizeFunction::resolveDeferredInstructions()
         }
     }
 
-    for (const auto &entry : dummyToScalarMap)
+    for (const auto& entry : dummyToScalarMap)
     {
         // Replace and erase all dummy instructions (don't use eraseFromParent as the dummy is not in the function)
-        Instruction *dummyInst = cast<Instruction>(entry.first);
+        Instruction* dummyInst = cast<Instruction>(entry.first);
         dummyInst->replaceAllUsesWith(entry.second);
         dummyInst->deleteValue();
     }
@@ -1453,9 +1458,8 @@ void ScalarizeFunction::resolveDeferredInstructions()
 
     m_DRL.clear();
 }
 
-extern "C" FunctionPass* createScalarizerPass(bool selectiveScalarization)
+extern "C" FunctionPass * createScalarizerPass(bool selectiveScalarization)
 {
     return new ScalarizeFunction(selectiveScalarization);
 }
-
diff --git a/IGC/Compiler/Optimizer/Scalarizer.h b/IGC/Compiler/Optimizer/Scalarizer.h
index 5709f52c02fe..67887eab8332 100644
--- a/IGC/Compiler/Optimizer/Scalarizer.h
+++ b/IGC/Compiler/Optimizer/Scalarizer.h
@@ -10,11 +10,11 @@ SPDX-License-Identifier: MIT
 
 #include "common/LLVMWarningsPush.hpp"
 #include <llvm/Pass.h>
-#include "llvm/IR/Dominators.h"
+#include <llvm/IR/Dominators.h>
 #include <llvm/IR/Function.h>
 #include <llvm/IR/Instructions.h>
 #include <llvm/IR/InstVisitor.h>
-#include "llvm/ADT/MapVector.h"
+#include <llvm/ADT/MapVector.h>
 #include <llvm/ADT/DenseMap.h>
 #include <llvm/ADT/DenseSet.h>
 #include <llvm/ADT/SmallVector.h>
@@ -23,13 +23,10 @@ SPDX-License-Identifier: MIT
 #include <llvm/ADT/SmallPtrSet.h>
 #include <llvm/IR/IRBuilder.h>
 #include <llvm/IR/Module.h>
-#include "common/LLVMWarningsPop.hpp"
 #include <llvm/IR/ValueHandle.h>
-
-#include <sstream>
-#include <string>
+#include "common/LLVMWarningsPop.hpp"
 #include <map>
-#include <set>
+#include <vector>
 
 namespace IGC
 {
@@ -54,10 +51,9 @@ namespace IGC
     public:
         static char ID; // Pass identification, replacement for typeid
 
-        ScalarizeFunction(bool selectiveScalarization = false);
+        ScalarizeFunction(bool selectiveScalarization = true);
         ScalarizeFunction(const ScalarizeFunction&) = delete;
         ScalarizeFunction& operator=(const ScalarizeFunction&) = delete;
-        ~ScalarizeFunction();
 
         /// @brief Provides name of pass
         virtual llvm::StringRef getPassName() const override
@@ -72,12 +68,14 @@ namespace IGC
             AU.setPreservesCFG();
         }
 
+        virtual bool doFinalization(llvm::Module& M) override;
         virtual bool runOnFunction(llvm::Function& F) override;
 
     private:
 
         /// @brief select an exclusive set that would not be scalarized
         void buildExclusiveSet();
+
         /// @brief main Method for dispatching instructions (according to inst type) for scalarization
         /// @param I instruction to dispatch
         void dispatchInstructionToScalarize(llvm::Instruction* I);
@@ -190,7 +188,24 @@ namespace IGC
         inline llvm::Function* getOrCreateDummyFunc(llvm::Type* dummyType, llvm::Module* module) {
             if (createdDummyFunctions.find(dummyType) == createdDummyFunctions.end()) {
                 llvm::FunctionType* funcType = llvm::FunctionType::get(dummyType, false);
-                llvm::Function* function = llvm::Function::Create(funcType, llvm::Function::InternalLinkage, "", module);
+                // Below: change of Internal linkage to External
+                //
+                // Dummy functions are tools used by the pass and they are never defined.
+                // If any dummy functions survive, they are removed in the destructor of the pass.
+                // Thus, the change of the linkage does not impact the net effect of the pass.
+                //
+                // The change is needed because erasing dummy functions in the destructor is not thread-safe.
+                // In my runs of "igc_opt", LLVM IR code generation would begin before the destructor call,
+                // which crashes LLVM due to the presence of undefined functions.
+                //
+                // It's difficult to properly fix this bug without significant changes to the pass.
+                // Unfortunately, overriding doFinalization does not resolve the problem.
+                //
+                // By changing internal linkage to external, "real-life" compilations proceed as before:
+                // the destructor always gets called, as there are many other passes in the pipeline.
+                // Under test conditions, however, LLVM no longer crashes,
+                // but declarations of external functions may appear in the LLVM IR.
+                llvm::Function* function = llvm::Function::Create(funcType, llvm::Function::ExternalLinkage, "", module);
                 createdDummyFunctions[dummyType] = function;
                 return function;
             }
@@ -250,5 +265,11 @@ namespace IGC
 
 } // namespace IGC
 
-/// By default (no argument given to this function), vector load/store are kept as is.
-extern "C" llvm::FunctionPass* createScalarizerPass(bool selectiveScalarization = false);
+/// @brief By default (no argument given to this function), selective scalarization is off.
+/// Selective scalarization keeps some instructions vectorized if the vector is used as a whole entity.
+/// The pass builds a web of instructions protected from scalarization.
+/// The ending legs of the web consist of vectorial instructions such as insert and extract elements,
+/// vector shuffles, GenISA intrinsics and function calls.
+/// The interior of the web consists of bitcasts and PHI nodes.
+extern "C" llvm::FunctionPass * createScalarizerPass(bool selectiveScalarization = false);
+
diff --git a/IGC/Compiler/tests/ScalarizeFunction/selective.ll b/IGC/Compiler/tests/ScalarizeFunction/selective.ll
new file mode 100644
index 000000000000..e20d3511b2f2
--- /dev/null
+++ b/IGC/Compiler/tests/ScalarizeFunction/selective.ll
@@ -0,0 +1,184 @@
+;=========================== begin_copyright_notice ============================
+;
+; Copyright (C) 2022 Intel Corporation
+;
+; SPDX-License-Identifier: MIT
+;
+;============================ end_copyright_notice =============================
+;
+; REQUIRES: regkeys
+; RUN: igc_opt --igc-scalarize -S --regkey=EnableSelectiveScalarizer=1 < %s | FileCheck %s
+; ------------------------------------------------
+; ScalarizeFunction
+; ------------------------------------------------
+; This test checks if selective scalarization leaves vectorial instructions un-scalarized.
+; ------------------------------------------------
+
+define spir_kernel void @test_selective_1(i64 %addr) #0 {
+; CHECK-LABEL: @test_selective_1(
+; CHECK:    [[VECT_INT:%.*]] = add <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, zeroinitializer
+; CHECK:    [[VECT_FLOAT:%.*]] = bitcast <8 x i32> [[VECT_INT]] to <8 x float>
+; CHECK:    [[VECT_INT_2:%.*]] = bitcast <8 x float> [[VECT_FLOAT]] to <8 x i32>
+; CHECK:    call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> [[VECT_INT_2]])
+; CHECK:    ret void
+;
+
+; define a vector and do some bitcasts
+; nothing should get scalarized here
+
+  %vectint = add <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, zeroinitializer
+  %vectfloat = bitcast <8 x i32> %vectint to <8 x float>
+  %vectcast = bitcast <8 x float> %vectfloat to <8 x i32>
+  call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 %addr, i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> %vectcast)
+
+  ret void
+}
+
+define spir_kernel void @test_selective_2(i64 %addr) #0 {
+; CHECK-LABEL: @test_selective_2(
+; CHECK:    [[VECT_INT:%.*]] = add <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, zeroinitializer
+; CHECK:    [[VECT_FLOAT:%.*]] = bitcast <8 x i32> [[VECT_INT]] to <8 x float>
+; CHECK:    [[VECT_INT_2:%.*]] = bitcast <8 x float> [[VECT_FLOAT]] to <8 x i32>
+; CHECK:    call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> [[VECT_INT_2]])
+; CHECK:    [[CAST:%.*]] = bitcast <8 x float> [[VECT_FLOAT]] to <8 x i32>
+; CHECK:    [[SCALAR_0:%.*]] = extractelement <8 x i32> [[CAST]], i32 0
+; CHECK:    [[SCALAR_1:%.*]] = extractelement <8 x i32> [[CAST]], i32 1
+; CHECK:    [[SCALAR_2:%.*]] = extractelement <8 x i32> [[CAST]], i32 2
+; CHECK:    [[SCALAR_3:%.*]] = extractelement <8 x i32> [[CAST]], i32 3
+; CHECK:    [[SCALAR_4:%.*]] = extractelement <8 x i32> [[CAST]], i32 4
+; CHECK:    [[SCALAR_5:%.*]] = extractelement <8 x i32> [[CAST]], i32 5
+; CHECK:    [[SCALAR_6:%.*]] = extractelement <8 x i32> [[CAST]], i32 6
+; CHECK:    [[SCALAR_7:%.*]] = extractelement <8 x i32> [[CAST]], i32 7
+; CHECK:    [[ADD:%.*]] = add i32 [[SCALAR_3]], [[SCALAR_5]]
+; CHECK:    ret void
+;
+; same as before, but %vectfloat is used in another branch of the code
+  %vectint = add <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, zeroinitializer
+  %vectfloat = bitcast <8 x i32> %vectint to <8 x float>
+  %vectcast = bitcast <8 x float> %vectfloat to <8 x i32>
+  call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 %addr, i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> %vectcast)
+; so scalarization should happen here
+  %anothercast = bitcast <8 x float> %vectfloat to <8 x i32>
+  %v1 = extractelement <8 x i32> %anothercast, i32 3
+  %v2 = extractelement <8 x i32> %anothercast, i32 5
+  %v3 = add i32 %v1, %v2
+  ret void
+}
+
+define spir_kernel void @test_selective_3() {
+; CHECK-LABEL: @test_selective_3(
+; CHECK:    br label %[[LOOP:.*]]
+; CHECK:  [[LOOP]]:
+; CHECK:    [[OFFSET:%.*]] = phi i32 [ 0, [[INIT0:%.*]] ], [ [[NEWOFFSET:%.*]], %[[LOOP]] ]
+; CHECK:    [[DATA:%.*]] = phi <8 x i32> [ zeroinitializer, [[INIT0]] ], [ [[NEWDATA:%.*]], %[[LOOP]] ]
+; CHECK:    [[NEWDATA]] = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> [[DATA]])
+; CHECK:    [[NEWOFFSET]] = add i32 [[OFFSET]], 1
+; CHECK:    [[CMP:%.*]] = icmp eq i32 [[NEWOFFSET]], 10
+; CHECK:    br i1 [[CMP]], label %[[END:.*]], label %[[LOOP]]
+; CHECK:  [[END]]:
+; CHECK:    ret void
+;
+; no scalarization happens here because the vectors %data and %newdata are used as a whole
+  br label %loop
+
+loop:
+  %offset = phi i32 [ 0, %0 ], [ %newoffset, %loop ]
+
+  %data = phi <8 x i32> [ zeroinitializer, %0 ], [ %newdata, %loop ]
+  %newdata = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> %data)
+
+  %newoffset = add i32 %offset, 1
+  %1 = icmp eq i32 %newoffset, 10
+  br i1 %1, label %end, label %loop
+
+end:
+  ret void
+}
+
+define spir_kernel void @test_selective_4(i64 %addr) #0 {
+; CHECK-LABEL: @test_selective_4(
+; CHECK:    br label %[[LOOP:.*]]
+; CHECK:  [[LOOP]]:
+; CHECK:    [[OFFSET:%.*]] = phi i32 [ 0, [[INIT0:%.*]] ], [ [[NEWOFFSET:%.*]], %[[LOOP]] ]
+; CHECK:    [[FLOAT_VECT:%.*]] = phi <8 x float> [ zeroinitializer, [[INIT0]] ], [ [[NEW_FLOAT_VECT:%.*]], %[[LOOP]] ]
+; CHECK:    [[INT_VECT:%.*]] = call <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 [[OFFSET]], i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0)
+; CHECK:    [[NEW_FLOAT_VECT]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> [[FLOAT_VECT]], <8 x i16> zeroinitializer, <8 x i32> [[INT_VECT]], i32 11, i32 11, i32 8, i32 8, i1 false)
+; CHECK:    [[NEWOFFSET]] = add i32 [[OFFSET]], 16
+; CHECK:    [[CMP:%.*]] = icmp eq i32 [[NEWOFFSET]], 256
+; CHECK:    br i1 [[CMP]], label %[[END:.*]], label %[[LOOP]]
+; CHECK:  [[END]]:
+; CHECK:    ret void
+;
+; same here: no scalarization
+  br label %loop
+
+loop:
+  %offset = phi i32 [ 0, %0 ], [ %newoffset, %loop ]
+  %float_vector = phi <8 x float> [ zeroinitializer, %0 ], [ %new_float_vector, %loop ]
+  %int_vector = call <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64 %addr, i32 1023, i32 511, i32 1023, i32 %offset, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0)
+  %new_float_vector = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %float_vector, <8 x i16> zeroinitializer, <8 x i32> %int_vector, i32 11, i32 11, i32 8, i32 8, i1 false)
+  %newoffset = add i32 %offset, 16
+  %1 = icmp eq i32 %newoffset, 256
+  br i1 %1, label %end, label %loop
+
+end:
+  ret void
+}
+
+
+define spir_kernel void @test_selective_5() {
+; CHECK-LABEL: @test_selective_5(
+; CHECK:    br label %[[LOOP:.*]]
+; CHECK:  [[LOOP]]:
+; CHECK:    [[OFFSET:%.*]] = phi i32 [ 0, [[INIT0:%.*]] ], [ [[NEWOFFSET:%.*]], %[[LOOP]] ]
+; CHECK:    [[DATA1:%.*]] = phi i32 [ 0, [[INIT0]] ], [ [[NEWDATA_SCALAR:%.*]], %[[LOOP]] ]
+; CHECK:    [[DATA3:%.*]] = phi i32 [ 0, [[INIT0]] ], [ [[NEWDATA_SCALAR10:%.*]], %[[LOOP]] ]
+; CHECK:    [[DATA4:%.*]] = phi i32 [ 0, [[INIT0]] ], [ [[NEWDATA_SCALAR11:%.*]], %[[LOOP]] ]
+; CHECK:    [[DATA5:%.*]] = phi i32 [ 0, [[INIT0]] ], [ [[NEWDATA_SCALAR12:%.*]], %[[LOOP]] ]
+; CHECK:    [[DATA6:%.*]] = phi i32 [ 0, [[INIT0]] ], [ undef, %[[LOOP]] ]
+; CHECK:    [[DATA7:%.*]] = phi i32 [ 0, [[INIT0]] ], [ undef, %[[LOOP]] ]
+; CHECK:    [[DATA8:%.*]] = phi i32 [ 0, [[INIT0]] ], [ undef, %[[LOOP]] ]
+; CHECK:    [[DATA9:%.*]] = phi i32 [ 0, [[INIT0]] ], [ undef, %[[LOOP]] ]
+; CHECK:    [[VECT:%.*]] = insertelement <4 x i32> undef, i32 [[DATA1]], i32 0
+; CHECK:    [[VECT13:%.*]] = insertelement <4 x i32> [[VECT]], i32 [[DATA3]], i32 1
+; CHECK:    [[VECT14:%.*]] = insertelement <4 x i32> [[VECT13]], i32 [[DATA4]], i32 2
+; CHECK:    [[VECT15:%.*]] = insertelement <4 x i32> [[VECT14]], i32 [[DATA5]], i32 3
+; CHECK:    [[NEWDATA:%.*]] = call <4 x i32> @do_math_v4i32_v4i32(<4 x i32> [[VECT15]])
+; CHECK:    [[NEWDATA_SCALAR]] = extractelement <4 x i32> [[NEWDATA]], i32 0
+; CHECK:    [[NEWDATA_SCALAR10]] = extractelement <4 x i32> [[NEWDATA]], i32 1
+; CHECK:    [[NEWDATA_SCALAR11]] = extractelement <4 x i32> [[NEWDATA]], i32 2
+; CHECK:    [[NEWDATA_SCALAR12]] = extractelement <4 x i32> [[NEWDATA]], i32 3
+; CHECK:    [[NEWOFFSET]] = add i32 [[OFFSET]], 1
+; CHECK:    [[CMP:%.*]] = icmp eq i32 [[NEWOFFSET]], 10
+; CHECK:    br i1 [[CMP]], label %[[END:.*]], label %[[LOOP]]
+; CHECK:  [[END]]:
+; CHECK:    ret void
+;
+; here shufflevectors break the vectorial nature of the arguments
+; scalarization should be done
+  br label %loop
+
+loop:
+  %offset = phi i32 [ 0, %0 ], [ %newoffset, %loop ]
+
+  %data = phi <8 x i32> [ zeroinitializer, %0 ], [ %newdata2, %loop ]
+  %data2 = shufflevector <8 x i32> %data, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %newdata = call <4 x i32> @do_math_v4i32_v4i32(<4 x i32> %data2)
+  %newdata2 = shufflevector <4 x i32> %newdata, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+
+  %newoffset = add i32 %offset, 1
+  %1 = icmp eq i32 %newoffset, 10
+  br i1 %1, label %end, label %loop
+
+end:
+  ret void
+}
+
+declare spir_func void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32, <8 x i32>) #1
+declare spir_func <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32) #1
+declare spir_func <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1) #1
+declare spir_func <4 x i32> @do_math_v4i32_v4i32(<4 x i32>) #1
+declare spir_func <8 x i32> @do_math_v8i32_v8i32(<8 x i32>) #1
+
+attributes #0 = { convergent nounwind }
+attributes #1 = { nounwind }
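
Note for reviewers (not part of the patch): the sketch below shows one way to exercise the new entry point programmatically, mirroring what the RUN line in selective.ll does through igc_opt. It is a minimal illustration only; runSelectiveScalarizer is a hypothetical helper, it assumes a build that links against IGC and LLVM, and the IGC_EnableSelectiveScalarizer regkey must still be enabled at runtime for the selective path to fire (see the constructor comment above).

    // Hypothetical driver snippet, for illustration only.
    #include "Compiler/Optimizer/Scalarizer.h"
    #include <llvm/IR/LegacyPassManager.h>
    #include <llvm/IR/Module.h>

    void runSelectiveScalarizer(llvm::Module& M)
    {
        llvm::legacy::PassManager PM;
        // true => selective scalarization; the DominatorTreeWrapperPass
        // dependency declared via getAnalysisUsage() is scheduled by the
        // legacy pass manager automatically.
        PM.add(createScalarizerPass(/*selectiveScalarization=*/true));
        PM.run(M);
    }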