From 2adb59ce3a30541e3f39606792274eef2d116d14 Mon Sep 17 00:00:00 2001
From: "Bzowski, Adam"
Date: Wed, 31 Jul 2024 16:38:58 +0000
Subject: [PATCH] Fixing selective scalarization

The ScalarizeFunction pass can keep some instructions vectorized if the
vector is used as a whole entity. The pass builds a web of instructions
protected from scalarization. The ending legs of the web are vectorial
instructions such as insert/extract element, vector shuffles, GenISA
intrinsics and function calls. The interior of the web consists of
bitcasts and PHI nodes.
---
 IGC/Compiler/Optimizer/Scalarizer.cpp         | 160 +++++++--------
 IGC/Compiler/Optimizer/Scalarizer.h           |  45 +++--
 .../tests/ScalarizeFunction/selective.ll      | 184 ++++++++++++++++++
 3 files changed, 299 insertions(+), 90 deletions(-)
 create mode 100644 IGC/Compiler/tests/ScalarizeFunction/selective.ll

diff --git a/IGC/Compiler/Optimizer/Scalarizer.cpp b/IGC/Compiler/Optimizer/Scalarizer.cpp
index 36486b4d8ea7..33e4a488a3e5 100644
--- a/IGC/Compiler/Optimizer/Scalarizer.cpp
+++ b/IGC/Compiler/Optimizer/Scalarizer.cpp
@@ -23,8 +23,8 @@ SPDX-License-Identifier: MIT
 #include "common/LLVMWarningsPop.hpp"
 #include "common/igc_regkeys.hpp"
 #include "common/Types.hpp"
-#include 
 #include "Probe/Assertion.h"
+#include 
 
 using namespace llvm;
 using namespace IGC;
@@ -62,6 +62,8 @@ ScalarizeFunction::ScalarizeFunction(bool selectiveScalarization) : FunctionPass
     initializeScalarizeFunctionPass(*PassRegistry::getPassRegistry());
     for (int i = 0; i < Instruction::OtherOpsEnd; i++) m_transposeCtr[i] = 0;
+
+    // Needs IGC_EnableSelectiveScalarizer = 1
     m_SelectiveScalarization = selectiveScalarization;
 
     // Initialize SCM buffers and allocation
@@ -70,14 +72,17 @@ ScalarizeFunction::ScalarizeFunction(bool selectiveScalarization) : FunctionPass
     m_SCMArrayLocation = 0;
 
     V_PRINT(scalarizer, "ScalarizeFunction constructor\n");
+    V_PRINT(scalarizer, "IGC_EnableSelectiveScalarizer = ");
+    V_PRINT(scalarizer, IGC_IS_FLAG_ENABLED(EnableSelectiveScalarizer));
+    V_PRINT(scalarizer, "\n");
 }
 
-ScalarizeFunction::~ScalarizeFunction()
-{
+bool ScalarizeFunction::doFinalization(llvm::Module& M) {
     releaseAllSCMEntries();
     delete[] m_SCMAllocationArray;
     destroyDummyFunc();
-    V_PRINT(scalarizer, "ScalarizeFunction destructor\n");
+    V_PRINT(scalarizer, "ScalarizeFunction doFinalization\n");
+    return true;
 }
 
 bool ScalarizeFunction::runOnFunction(Function& F)
@@ -157,7 +162,7 @@ bool ScalarizeFunction::runOnFunction(Function& F)
     for (; index != re; ++index)
     {
         // get rid of old users
-        if (Value * val = dyn_cast<Value>(*index))
+        if (Value* val = dyn_cast<Value>(*index))
        {
             UndefValue* undefVal = UndefValue::get((*index)->getType());
             (val)->replaceAllUsesWith(undefVal);
         }
@@ -171,13 +176,18 @@ bool ScalarizeFunction::runOnFunction(Function& F)
 }
 
 ///
-/// @brief We want to avoid scalarize vector-phi node if the vector is used
+/// @brief We want to avoid scalarization of vector instructions if the vector is used
 /// as a whole entity somewhere in the program. This function tries to find
 /// this kind of definition web that involves phi-node, insert-element etc,
 /// then add them into the exclusion-set (excluded from scalarization).
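+///
+/// A minimal sketch of a protected web (illustrative IR only, echoing the
+/// selective.ll test added by this patch): the PHI and the bitcasts form the
+/// interior of the web, the GenISA block write ends a leg, and the whole
+/// chain is kept vectorized:
+///
+///   %v = phi <8 x i32> [ zeroinitializer, %entry ], [ %w, %loop ]
+///   %f = bitcast <8 x i32> %v to <8 x float>
+///   %u = bitcast <8 x float> %f to <8 x i32>
+///   call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(..., <8 x i32> %u)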
 ///
 void ScalarizeFunction::buildExclusiveSet()
 {
+
+    auto isAddToWeb = [](Value* V) -> bool {
+        return isa<BitCastInst>(V) || isa<PHINode>(V);
+    };
+
     auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
     for (auto dfi = df_begin(DT->getRootNode()),
         dfe = df_end(DT->getRootNode()); dfi != dfe; ++dfi)
@@ -190,7 +200,10 @@ void ScalarizeFunction::buildExclusiveSet()
             Instruction* currInst = &*sI;
             ++sI;
             // find the seed for the workset
-            std::vector<llvm::Value*> workset;
+            std::vector<Value*> workset;
+
+            // Instructions that accept vectorial arguments can end legs of the web,
+            // i.e. the instructions that produce the vectorial arguments may be protected from scalarization
             if (GenIntrinsicInst* GII = dyn_cast<GenIntrinsicInst>(currInst))
             {
                 unsigned numOperands = IGCLLVM::getNumArgOperands(GII);
@@ -203,6 +216,16 @@
                     }
                 }
             }
+            else if (CallInst* CI = dyn_cast<CallInst>(currInst))
+            {
+                for (auto arg = CI->arg_begin(); arg != CI->arg_end(); ++arg)
+                {
+                    if (isa<VectorType>(arg->get()->getType()))
+                    {
+                        workset.push_back(arg->get());
+                    }
+                }
+            }
             else if (auto IEI = dyn_cast<InsertElementInst>(currInst))
             {
                 Value* scalarIndexVal = IEI->getOperand(2);
@@ -219,9 +242,12 @@
                     workset.push_back(EEI->getOperand(0));
                 }
             }
-            // try to find a phi-web from the seed
-            bool HasPHI = false;
-            std::set<llvm::Value*> defweb;
+            else if (BitCastInst* BCI = dyn_cast<BitCastInst>(currInst))
+            {
+                workset.push_back(BCI->getOperand(0));
+            }
+            // try to find a web from the seed
+            std::set<Value*> defweb;
             while (!workset.empty())
             {
                 auto Def = workset.back();
@@ -230,70 +256,45 @@
                 {
                     continue;
                 }
-                if (auto IEI = dyn_cast<InsertElementInst>(Def))
-                {
-                    defweb.insert(IEI);
-                    if (!defweb.count(IEI->getOperand(0)) &&
-                        (isa<InsertElementInst>(IEI->getOperand(0)) ||
-                            isa<ShuffleVectorInst>(IEI->getOperand(0)) ||
-                            isa<PHINode>(IEI->getOperand(0))))
-                    {
-                        workset.push_back(IEI->getOperand(0));
-                    }
-                }
-                else if (auto SVI = dyn_cast<ShuffleVectorInst>(Def))
+
+                // The web grows "up" through BitCasts and PHI nodes,
+                // but insert/extract elements and vector shuffles should be scalarized
+                if (!isAddToWeb(Def)) continue;
+
+                if (BitCastInst* BCI = dyn_cast<BitCastInst>(Def))
                 {
-                    defweb.insert(SVI);
-                    if (!defweb.count(SVI->getOperand(0)) &&
-                        (isa<InsertElementInst>(SVI->getOperand(0)) ||
-                            isa<ShuffleVectorInst>(SVI->getOperand(0)) ||
-                            isa<PHINode>(SVI->getOperand(0))))
-                    {
-                        workset.push_back(SVI->getOperand(0));
-                    }
-                    if (!defweb.count(SVI->getOperand(1)) &&
-                        (isa<InsertElementInst>(SVI->getOperand(1)) ||
-                            isa<ShuffleVectorInst>(SVI->getOperand(1)) ||
-                            isa<PHINode>(SVI->getOperand(1))))
+                    defweb.insert(BCI);
+                    if (!defweb.count(BCI->getOperand(0)) && isAddToWeb(BCI->getOperand(0)))
                     {
-                        workset.push_back(SVI->getOperand(1));
+                        workset.push_back(BCI->getOperand(0));
                     }
                 }
                 else if (auto PHI = dyn_cast<PHINode>(Def))
                 {
                     defweb.insert(PHI);
-                    HasPHI = true; // !this def-web is qualified!
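+                    // Each incoming value of the PHI may extend the web further up.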
                     for (int i = 0, n = PHI->getNumOperands(); i < n; ++i)
-                        if (!defweb.count(PHI->getOperand(i)) &&
-                            (isa<InsertElementInst>(PHI->getOperand(i)) ||
-                                isa<ShuffleVectorInst>(PHI->getOperand(i)) ||
-                                isa<PHINode>(PHI->getOperand(i))))
+                    {
+                        if (!defweb.count(PHI->getOperand(i)) && isAddToWeb(PHI->getOperand(i)))
                         {
                             workset.push_back(PHI->getOperand(i));
                         }
+                    }
                 }
                 else
                 {
                     continue;
                 }
-                // check use
+
+                // The web grows "down" through BitCasts and PHI nodes as well
                 for (auto U : Def->users())
                 {
-                    if (!defweb.count(U) &&
-                        (isa<InsertElementInst>(U) ||
-                            isa<ShuffleVectorInst>(U) ||
-                            isa<PHINode>(U)))
+                    if (!defweb.count(U) && isAddToWeb(U))
                    {
                         workset.push_back(U);
                     }
                 }
             }
-            // if we find a qualified web with PHINode, add those instructions
-            // into the exclusion set
-            if (HasPHI)
-            {
-                m_Excludes.merge(defweb);
-            }
+            m_Excludes.merge(defweb);
         }
     }
 }
@@ -390,7 +391,7 @@ void ScalarizeFunction::recoverNonScalarizableInst(Instruction* Inst)
     if (isa<VectorType>(Inst->getType())) getSCMEntry(Inst);
     // Iterate over all arguments. Check that they all exist (or rebuilt)
-    if (CallInst * CI = dyn_cast<CallInst>(Inst))
+    if (CallInst* CI = dyn_cast<CallInst>(Inst))
     {
         unsigned numOperands = IGCLLVM::getNumArgOperands(CI);
         for (unsigned i = 0; i < numOperands; i++)
@@ -508,7 +509,7 @@ void ScalarizeFunction::scalarizeInstruction(BinaryOperator* BI)
             BI->getName(),
             BI
         );
-        if (BinaryOperator * BO = dyn_cast<BinaryOperator>(Val)) {
+        if (BinaryOperator* BO = dyn_cast<BinaryOperator>(Val)) {
             // Copy overflow flags if any.
             if (isa<OverflowingBinaryOperator>(BO)) {
                 BO->setHasNoSignedWrap(BI->hasNoSignedWrap());
@@ -609,7 +610,7 @@ void ScalarizeFunction::scalarizeInstruction(CastInst* CI)
         "unexpected type!");
     IGC_ASSERT_MESSAGE(
         cast<IGCLLVM::FixedVectorType>(CI->getOperand(0)->getType())
-        ->getNumElements() == numElements,
+            ->getNumElements() == numElements,
         "unexpected vector width");
 
     // Obtain scalarized argument
@@ -666,7 +667,7 @@ void ScalarizeFunction::scalarizeInstruction(PHINode* PI)
     {
         auto* Op = PI->getIncomingValue(i);
 
-        if (auto * GII = dyn_cast<GenIntrinsicInst>(Op))
+        if (auto* GII = dyn_cast<GenIntrinsicInst>(Op))
         {
             switch (GII->getIntrinsicID())
             {
@@ -694,7 +695,7 @@ void ScalarizeFunction::scalarizeInstruction(PHINode* PI)
             phis.pop_back();
             for (auto U : PN->users())
             {
-                if (GenIntrinsicInst * GII = dyn_cast<GenIntrinsicInst>(U))
+                if (GenIntrinsicInst* GII = dyn_cast<GenIntrinsicInst>(U))
                 {
                     switch (GII->getIntrinsicID())
                     {
@@ -703,11 +704,16 @@ void ScalarizeFunction::scalarizeInstruction(PHINode* PI)
                     case GenISAIntrinsic::GenISA_sub_group_dpas:
                     case GenISAIntrinsic::GenISA_dpas:
                     case GenISAIntrinsic::GenISA_simdBlockWrite:
+                    case GenISAIntrinsic::GenISA_simdBlockWriteBindless:
+                    case GenISAIntrinsic::GenISA_simdMediaBlockWrite:
+                    case GenISAIntrinsic::GenISA_LSC2DBlockWrite:
+                    case GenISAIntrinsic::GenISA_LSC2DBlockWriteAddrPayload:
+                    case GenISAIntrinsic::GenISA_LSCStoreBlock:
                         recoverNonScalarizableInst(PI);
                         return;
                     }
                 }
-                else if (PHINode * N = dyn_cast<PHINode>(U))
+                else if (PHINode* N = dyn_cast<PHINode>(U))
                 {
                     if (visited.count(N) == 0) {
                         visited[N] = 1;
@@ -720,7 +726,6 @@ void ScalarizeFunction::scalarizeInstruction(PHINode* PI)
         phis.clear();
     }
 
-
     // Prepare empty SCM entry for the instruction
     SCMEntry* newEntry = getSCMEntry(PI);
@@ -1047,7 +1052,7 @@ void ScalarizeFunction::scalarizeInstruction(GetElementPtrInst* GI)
             auto op1 = baseValue->getType()->isVectorTy() ? operand1[i] : baseValue;
             auto op2 = indexValue->getType()->isVectorTy() ?
                 operand2[i] : indexValue;
-            Type *BaseTy = IGCLLVM::getNonOpaquePtrEltTy(op1->getType());
+            Type* BaseTy = IGCLLVM::getNonOpaquePtrEltTy(op1->getType());
             Value* newGEP = GetElementPtrInst::Create(BaseTy, op1, op2,
                 VALUE_NAME(GI->getName()), GI);
             Value* constIndex = ConstantInt::get(Type::getInt32Ty(context()), i);
@@ -1123,7 +1128,7 @@ void ScalarizeFunction::obtainScalarizedValues(SmallVectorImpl<Value*>& retValue
                 retValues[i + destIdx] = undefElement;
             }
         }
-        else if (Constant * vectorConst = dyn_cast<Constant>(origValue))
+        else if (Constant* vectorConst = dyn_cast<Constant>(origValue))
         {
             V_PRINT(scalarizer, "\t\t\tProper constant: " << *vectorConst << "\n");
             // Value is a constant. Break it down to scalars by employing a constant expression
@@ -1310,7 +1315,7 @@ void ScalarizeFunction::updateSCMEntryWithValues(ScalarizeFunction::SCMEntry* en
 
     if (matchDbgLoc)
     {
-        if (const Instruction * origInst = dyn_cast<Instruction>(origValue))
+        if (const Instruction* origInst = dyn_cast<Instruction>(origValue))
         {
             for (unsigned i = 0; i < width; ++i)
             {
@@ -1347,17 +1352,17 @@ void ScalarizeFunction::resolveDeferredInstructions()
 
     // lambda to check if a value is a dummy instruction
     auto isDummyValue = [this](Value* val)
-    {
-        auto* call = dyn_cast<CallInst>(val);
-        if (!call) return false;
-        // If the Value is one of the dummy functions that we created.
-        for (const auto& function : createdDummyFunctions) {
-            if (call->getCalledFunction() == function.second)
-                return true;
-        }
+        {
+            auto* call = dyn_cast<CallInst>(val);
+            if (!call) return false;
+            // If the Value is one of the dummy functions that we created.
+            for (const auto& function : createdDummyFunctions) {
+                if (call->getCalledFunction() == function.second)
+                    return true;
+            }
 
-        return false;
-    };
+            return false;
+        };
 
     for (auto deferredEntry = m_DRL.begin(); m_DRL.size() > 0;)
     {
@@ -1395,8 +1400,8 @@ void ScalarizeFunction::resolveDeferredInstructions()
             newInsts.resize(width);
             for (unsigned i = 0; i < width; i++)
             {
-                Value *constIndex = ConstantInt::get(Type::getInt32Ty(context()), i);
-                Instruction *EE = ExtractElementInst::Create(vectorInst, constIndex,
+                Value* constIndex = ConstantInt::get(Type::getInt32Ty(context()), i);
+                Instruction* EE = ExtractElementInst::Create(vectorInst, constIndex,
                     VALUE_NAME(vectorInst->getName() + ".scalar"), &(*insertLocation));
                 newInsts[i] = EE;
             }
@@ -1417,7 +1422,7 @@ void ScalarizeFunction::resolveDeferredInstructions()
                 // It's possible the scalar values are not resolved earlier and are themselves dummy instructions.
                 // In order to find the real value, we look in the map to see which value replaced it.
                 if (dummyToScalarMap.count(scalarVal))
-                        scalarVal = dummyToScalarMap[scalarVal];
+                    scalarVal = dummyToScalarMap[scalarVal];
                 else
                     totallyResolved = false;
             }
@@ -1441,10 +1446,10 @@ void ScalarizeFunction::resolveDeferredInstructions()
         }
     }
 
-    for (const auto &entry : dummyToScalarMap)
+    for (const auto& entry : dummyToScalarMap)
     {
         // Replace and erase all dummy instructions (don't use eraseFromParent as the dummy is not in the function)
-        Instruction *dummyInst = cast<Instruction>(entry.first);
+        Instruction* dummyInst = cast<Instruction>(entry.first);
         dummyInst->replaceAllUsesWith(entry.second);
         dummyInst->deleteValue();
     }
@@ -1453,9 +1458,8 @@ void ScalarizeFunction::resolveDeferredInstructions()
     m_DRL.clear();
 }
 
-extern "C" FunctionPass* createScalarizerPass(bool selectiveScalarization)
+extern "C" FunctionPass * createScalarizerPass(bool selectiveScalarization)
 {
     return new ScalarizeFunction(selectiveScalarization);
 }
-
diff --git a/IGC/Compiler/Optimizer/Scalarizer.h b/IGC/Compiler/Optimizer/Scalarizer.h
index 5709f52c02fe..67887eab8332 100644
--- a/IGC/Compiler/Optimizer/Scalarizer.h
+++ b/IGC/Compiler/Optimizer/Scalarizer.h
@@ -10,11 +10,11 @@ SPDX-License-Identifier: MIT
 
 #include "common/LLVMWarningsPush.hpp"
 #include 
-#include "llvm/IR/Dominators.h"
+#include <llvm/IR/Dominators.h>
 #include 
 #include 
 #include 
-#include "llvm/ADT/MapVector.h"
+#include <llvm/ADT/MapVector.h>
 #include 
 #include 
 #include 
@@ -23,13 +23,10 @@ SPDX-License-Identifier: MIT
 #include 
 #include 
 #include 
-#include "common/LLVMWarningsPop.hpp"
 #include 
-
-#include 
-#include 
+#include "common/LLVMWarningsPop.hpp"
 #include 
-#include 
+#include 
 
 namespace IGC
 {
@@ -54,10 +51,9 @@ namespace IGC
     public:
         static char ID; // Pass identification, replacement for typeid
 
-        ScalarizeFunction(bool selectiveScalarization = false);
+        ScalarizeFunction(bool selectiveScalarization = true);
         ScalarizeFunction(const ScalarizeFunction&) = delete;
         ScalarizeFunction& operator=(const ScalarizeFunction&) = delete;
-        ~ScalarizeFunction();
 
         /// @brief Provides name of pass
         virtual llvm::StringRef getPassName() const override
@@ -72,12 +68,14 @@ namespace IGC
             AU.setPreservesCFG();
         }
 
+        virtual bool doFinalization(llvm::Module& M) override;
         virtual bool runOnFunction(llvm::Function& F) override;
 
     private:
 
         /// @brief select an exclusive set that would not be scalarized
         void buildExclusiveSet();
+
         /// @brief main Method for dispatching instructions (according to inst type) for scalarization
         /// @param I instruction to dispatch
         void dispatchInstructionToScalarize(llvm::Instruction* I);
@@ -190,7 +188,24 @@ namespace IGC
         inline llvm::Function* getOrCreateDummyFunc(llvm::Type* dummyType, llvm::Module* module) {
             if (createdDummyFunctions.find(dummyType) == createdDummyFunctions.end()) {
                 llvm::FunctionType* funcType = llvm::FunctionType::get(dummyType, false);
-                llvm::Function* function = llvm::Function::Create(funcType, llvm::Function::InternalLinkage, "", module);
+                // Below: change of Internal linkage to External
+                //
+                // Dummy functions are tools used by the pass and they are never defined.
+                // If any dummy functions survive, they are removed in the destructor of the pass.
+                // Thus, the change of the linkage does not impact the net effect of the pass.
+                //
+                // The change is due to the fact that erasing dummy functions in the destructor is not thread-safe.
+                // In my runs of "igc_opt" the LLVM IR code generation would begin before the destructor call.
+                // This crashes LLVM due to the presence of undefined functions.
+                //
+                // It's difficult to properly fix this bug without significant changes to the pass.
+                // Unfortunately, overriding doFinalization does not resolve the problem.
+                //
+                // By changing internal linkage to external, "real-life" compilations proceed as before:
+                // the destructor always gets called, as there are many other passes in the pipeline.
+                // Under the testing conditions, however, LLVM no longer crashes,
+                // but declarations of external functions may appear in the LLVM IR.
+                llvm::Function* function = llvm::Function::Create(funcType, llvm::Function::ExternalLinkage, "", module);
                 createdDummyFunctions[dummyType] = function;
                 return function;
             }
@@ -250,5 +265,11 @@ namespace IGC
 
 } // namespace IGC
 
-/// By default (no argument given to this function), vector load/store are kept as is.
-extern "C" llvm::FunctionPass* createScalarizerPass(bool selectiveScalarization = false);
+/// @brief By default (no argument given to this function), selective scalarization is off.
+/// Selective scalarization keeps some instructions vectorized if the vector is used as a whole entity.
+/// The pass builds a web of instructions protected from scalarization.
+/// The ending legs of the web consist of vectorial instructions such as insert and extract elements,
+/// vector shuffles, GenISA intrinsics and function calls.
+/// The vectorial instructions inside the web consist of bitcasts and PHI nodes.
+extern "C" llvm::FunctionPass * createScalarizerPass(bool selectiveScalarization = false);
+
diff --git a/IGC/Compiler/tests/ScalarizeFunction/selective.ll b/IGC/Compiler/tests/ScalarizeFunction/selective.ll
new file mode 100644
index 000000000000..e20d3511b2f2
--- /dev/null
+++ b/IGC/Compiler/tests/ScalarizeFunction/selective.ll
@@ -0,0 +1,184 @@
+;=========================== begin_copyright_notice ============================
+;
+; Copyright (C) 2022 Intel Corporation
+;
+; SPDX-License-Identifier: MIT
+;
+;============================ end_copyright_notice =============================
+;
+; REQUIRES: regkeys
+; RUN: igc_opt --igc-scalarize -S --regkey=EnableSelectiveScalarizer=1 < %s | FileCheck %s
+; ------------------------------------------------
+; ScalarizeFunction
+; ------------------------------------------------
+; This test checks if selective scalarization leaves vectorial instructions un-scalarized.
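+;
+; Tests 1, 3 and 4 expect their webs (bitcasts and PHI nodes sealed by GenISA
+; intrinsics or calls that consume the whole vector) to stay vectorized.
+; Test 2 keeps the web feeding the block write but scalarizes the extra branch
+; that extracts elements; in test 5 the shufflevectors break the web, so the
+; vector PHI is scalarized.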
+; ------------------------------------------------
+
+define spir_kernel void @test_selective_1(i64 %addr) #0 {
+; CHECK-LABEL: @test_selective_1(
+; CHECK: [[VECT_INT:%.*]] = add <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, zeroinitializer
+; CHECK: [[VECT_FLOAT:%.*]] = bitcast <8 x i32> [[VECT_INT]] to <8 x float>
+; CHECK: [[VECT_INT_2:%.*]] = bitcast <8 x float> [[VECT_FLOAT]] to <8 x i32>
+; CHECK: call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> [[VECT_INT_2]])
+; CHECK: ret void
+;
+
+; define a vector and do some bitcasts
+; nothing should get scalarized here
+
+  %vectint = add <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, zeroinitializer
+  %vectfloat = bitcast <8 x i32> %vectint to <8 x float>
+  %vectcast = bitcast <8 x float> %vectfloat to <8 x i32>
+  call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 %addr, i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> %vectcast)
+
+  ret void
+}
+
+define spir_kernel void @test_selective_2(i64 %addr) #0 {
+; CHECK-LABEL: @test_selective_2(
+; CHECK: [[VECT_INT:%.*]] = add <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, zeroinitializer
+; CHECK: [[VECT_FLOAT:%.*]] = bitcast <8 x i32> [[VECT_INT]] to <8 x float>
+; CHECK: [[VECT_INT_2:%.*]] = bitcast <8 x float> [[VECT_FLOAT]] to <8 x i32>
+; CHECK: call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> [[VECT_INT_2]])
+; CHECK: [[CAST:%.*]] = bitcast <8 x float> [[VECT_FLOAT]] to <8 x i32>
+; CHECK: [[SCALAR_0:%.*]] = extractelement <8 x i32> [[CAST]], i32 0
+; CHECK: [[SCALAR_1:%.*]] = extractelement <8 x i32> [[CAST]], i32 1
+; CHECK: [[SCALAR_2:%.*]] = extractelement <8 x i32> [[CAST]], i32 2
+; CHECK: [[SCALAR_3:%.*]] = extractelement <8 x i32> [[CAST]], i32 3
+; CHECK: [[SCALAR_4:%.*]] = extractelement <8 x i32> [[CAST]], i32 4
+; CHECK: [[SCALAR_5:%.*]] = extractelement <8 x i32> [[CAST]], i32 5
+; CHECK: [[SCALAR_6:%.*]] = extractelement <8 x i32> [[CAST]], i32 6
+; CHECK: [[SCALAR_7:%.*]] = extractelement <8 x i32> [[CAST]], i32 7
+; CHECK: [[ADD:%.*]] = add i32 [[SCALAR_3]], [[SCALAR_5]]
+; CHECK: ret void
+;
+; same as before, but %vectfloat is used in another branch of the code
+  %vectint = add <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, zeroinitializer
+  %vectfloat = bitcast <8 x i32> %vectint to <8 x float>
+  %vectcast = bitcast <8 x float> %vectfloat to <8 x i32>
+  call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 %addr, i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> %vectcast)
+; so scalarization should happen here
+  %anothercast = bitcast <8 x float> %vectfloat to <8 x i32>
+  %v1 = extractelement <8 x i32> %anothercast, i32 3
+  %v2 = extractelement <8 x i32> %anothercast, i32 5
+  %v3 = add i32 %v1, %v2
+  ret void
+}
+
+define spir_kernel void @test_selective_3() {
+; CHECK-LABEL: @test_selective_3(
+; CHECK: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK: [[OFFSET:%.*]] = phi i32 [ 0, [[INIT0:%.*]] ], [ [[NEWOFFSET:%.*]], %[[LOOP]] ]
+; CHECK: [[DATA:%.*]] = phi <8 x i32> [ zeroinitializer, [[INIT0]] ], [ [[NEWDATA:%.*]], %[[LOOP]] ]
+; CHECK: [[NEWDATA]] = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> [[DATA]])
+; CHECK: [[NEWOFFSET]] = add i32 [[OFFSET]], 1
+; CHECK: [[CMP:%.*]] = icmp eq i32 [[NEWOFFSET]], 10
+; CHECK: br i1 [[CMP]], label %[[END:.*]], label %[[LOOP]]
+; CHECK: [[END]]:
+; CHECK: ret void
+;
+; no scalarization happens here because the vectors %data and %newdata are used as a whole
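+; (%data feeds the call to @do_math_v8i32_v8i32 and the call result feeds %data
+; back through the PHI, so the PHI and the call form one closed, protected web)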
+  br label %loop
+
+loop:
+  %offset = phi i32 [ 0, %0 ], [ %newoffset, %loop ]
+
+  %data = phi <8 x i32> [ zeroinitializer, %0 ], [ %newdata, %loop ]
+  %newdata = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> %data)
+
+  %newoffset = add i32 %offset, 1
+  %1 = icmp eq i32 %newoffset, 10
+  br i1 %1, label %end, label %loop
+
+end:
+  ret void
+}
+
+define spir_kernel void @test_selective_4(i64 %addr) #0 {
+; CHECK-LABEL: @test_selective_4(
+; CHECK: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK: [[OFFSET:%.*]] = phi i32 [ 0, [[INIT0:%.*]] ], [ [[NEWOFFSET:%.*]], %[[LOOP]] ]
+; CHECK: [[FLOAT_VECT:%.*]] = phi <8 x float> [ zeroinitializer, [[INIT0]] ], [ [[NEW_FLOAT_VECT:%.*]], %[[LOOP]] ]
+; CHECK: [[INT_VECT:%.*]] = call <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 [[OFFSET]], i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0)
+; CHECK: [[NEW_FLOAT_VECT]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> [[FLOAT_VECT]], <8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i32> [[INT_VECT]], i32 11, i32 11, i32 8, i32 8, i1 false)
+; CHECK: [[NEWOFFSET]] = add i32 [[OFFSET]], 16
+; CHECK: [[CMP:%.*]] = icmp eq i32 [[NEWOFFSET]], 256
+; CHECK: br i1 [[CMP]], label %[[END:.*]], label %[[LOOP]]
+; CHECK: [[END]]:
+; CHECK: ret void
+;
+; same here: no scalarization
+  br label %loop
+
+loop:
+  %offset = phi i32 [ 0, %0 ], [ %newoffset, %loop ]
+  %float_vector = phi <8 x float> [ zeroinitializer, %0 ], [ %new_float_vector, %loop ]
+  %int_vector = call <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64 %addr, i32 1023, i32 511, i32 1023, i32 %offset, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0)
+  %new_float_vector = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %float_vector, <8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i32> %int_vector, i32 11, i32 11, i32 8, i32 8, i1 false)
+  %newoffset = add i32 %offset, 16
+  %1 = icmp eq i32 %newoffset, 256
+  br i1 %1, label %end, label %loop
+
+end:
+  ret void
+}
+
+
+define spir_kernel void @test_selective_5() {
+; CHECK-LABEL: @test_selective_5(
+; CHECK: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK: [[OFFSET:%.*]] = phi i32 [ 0, [[INIT0:%.*]] ], [ [[NEWOFFSET:%.*]], %[[LOOP]] ]
+; CHECK: [[DATA1:%.*]] = phi i32 [ 0, [[INIT0]] ], [ [[NEWDATA_SCALAR:%.*]], %[[LOOP]] ]
+; CHECK: [[DATA3:%.*]] = phi i32 [ 0, [[INIT0]] ], [ [[NEWDATA_SCALAR10:%.*]], %[[LOOP]] ]
+; CHECK: [[DATA4:%.*]] = phi i32 [ 0, [[INIT0]] ], [ [[NEWDATA_SCALAR11:%.*]], %[[LOOP]] ]
+; CHECK: [[DATA5:%.*]] = phi i32 [ 0, [[INIT0]] ], [ [[NEWDATA_SCALAR12:%.*]], %[[LOOP]] ]
+; CHECK: [[DATA6:%.*]] = phi i32 [ 0, [[INIT0]] ], [ undef, %[[LOOP]] ]
+; CHECK: [[DATA7:%.*]] = phi i32 [ 0, [[INIT0]] ], [ undef, %[[LOOP]] ]
+; CHECK: [[DATA8:%.*]] = phi i32 [ 0, [[INIT0]] ], [ undef, %[[LOOP]] ]
+; CHECK: [[DATA9:%.*]] = phi i32 [ 0, [[INIT0]] ], [ undef, %[[LOOP]] ]
+; CHECK: [[VECT:%.*]] = insertelement <4 x i32> undef, i32 [[DATA1]], i32 0
+; CHECK: [[VECT13:%.*]] = insertelement <4 x i32> [[VECT]], i32 [[DATA3]], i32 1
+; CHECK: [[VECT14:%.*]] = insertelement <4 x i32> [[VECT13]], i32 [[DATA4]], i32 2
+; CHECK: [[VECT15:%.*]] = insertelement <4 x i32> [[VECT14]], i32 [[DATA5]], i32 3
+; CHECK: [[NEWDATA:%.*]] = call <4 x i32> @do_math_v4i32_v4i32(<4 x i32> [[VECT15]])
+; CHECK: [[NEWDATA_SCALAR]] = extractelement <4 x i32> [[NEWDATA]], i32 0
+; CHECK: [[NEWDATA_SCALAR10]] = extractelement <4 x i32> [[NEWDATA]], i32 1
+; CHECK: [[NEWDATA_SCALAR11]] = extractelement <4 x i32> [[NEWDATA]], i32 2
+; CHECK: [[NEWDATA_SCALAR12]] = extractelement <4 x i32> [[NEWDATA]], i32 3
+; CHECK: [[NEWOFFSET]] = add i32 [[OFFSET]], 1
+; CHECK: [[CMP:%.*]] = icmp eq i32 [[NEWOFFSET]], 10
+; CHECK: br i1 [[CMP]], label %[[END:.*]], label %[[LOOP]]
+; CHECK: [[END]]:
+; CHECK: ret void
+;
+; here the shufflevectors break the vectorial nature of the arguments,
+; so scalarization should be done
+  br label %loop
+
+loop:
+  %offset = phi i32 [ 0, %0 ], [ %newoffset, %loop ]
+
+  %data = phi <8 x i32> [ zeroinitializer, %0 ], [ %newdata2, %loop ]
+  %data2 = shufflevector <8 x i32> %data, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %newdata = call <4 x i32> @do_math_v4i32_v4i32(<4 x i32> %data2)
+  %newdata2 = shufflevector <4 x i32> %newdata, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+  %newoffset = add i32 %offset, 1
+  %1 = icmp eq i32 %newoffset, 10
+  br i1 %1, label %end, label %loop
+
+end:
+  ret void
+}
+
+declare spir_func void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32, <8 x i32>) #1
+declare spir_func <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32) #1
+declare spir_func <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1) #1
+declare spir_func <4 x i32> @do_math_v4i32_v4i32(<4 x i32>) #1
+declare spir_func <8 x i32> @do_math_v8i32_v8i32(<8 x i32>) #1
+
+attributes #0 = { convergent nounwind }
+attributes #1 = { nounwind }