From 2adb59ce3a30541e3f39606792274eef2d116d14 Mon Sep 17 00:00:00 2001
From: "Bzowski, Adam"
Date: Wed, 31 Jul 2024 16:38:58 +0000
Subject: [PATCH] Fixing selective scalarization

The ScalarizeFunction pass can keep some instructions vectorized if the
vector is used as a whole entity. The pass builds a web of instructions
protected from scalarization. The ending legs of the web are vectorial
instructions such as insert/extract element, vector shuffles, GenISA
intrinsics and function calls. The interior of the web consists of
bitcasts and PHI nodes.
---
 IGC/Compiler/Optimizer/Scalarizer.cpp         | 160 +++++++--------
 IGC/Compiler/Optimizer/Scalarizer.h           |  45 +++--
 .../tests/ScalarizeFunction/selective.ll      | 184 ++++++++++++++++++
 3 files changed, 299 insertions(+), 90 deletions(-)
 create mode 100644 IGC/Compiler/tests/ScalarizeFunction/selective.ll

diff --git a/IGC/Compiler/Optimizer/Scalarizer.cpp b/IGC/Compiler/Optimizer/Scalarizer.cpp
index 36486b4d8ea7..33e4a488a3e5 100644
--- a/IGC/Compiler/Optimizer/Scalarizer.cpp
+++ b/IGC/Compiler/Optimizer/Scalarizer.cpp
@@ -23,8 +23,8 @@ SPDX-License-Identifier: MIT
 #include "common/LLVMWarningsPop.hpp"
 #include "common/igc_regkeys.hpp"
 #include "common/Types.hpp"
-#include 
 #include "Probe/Assertion.h"
+#include 
 
 using namespace llvm;
 using namespace IGC;
@@ -62,6 +62,8 @@ ScalarizeFunction::ScalarizeFunction(bool selectiveScalarization) : FunctionPass
     initializeScalarizeFunctionPass(*PassRegistry::getPassRegistry());
     for (int i = 0; i < Instruction::OtherOpsEnd; i++) m_transposeCtr[i] = 0;
+
+    // Needs IGC_EnableSelectiveScalarizer = 1
     m_SelectiveScalarization = selectiveScalarization;
 
     // Initialize SCM buffers and allocation
@@ -70,14 +72,17 @@ ScalarizeFunction::ScalarizeFunction(bool selectiveScalarization) : FunctionPass
     m_SCMArrayLocation = 0;
 
     V_PRINT(scalarizer, "ScalarizeFunction constructor\n");
+    V_PRINT(scalarizer, "IGC_EnableSelectiveScalarizer = ");
+    V_PRINT(scalarizer, IGC_IS_FLAG_ENABLED(EnableSelectiveScalarizer));
+    V_PRINT(scalarizer, "\n");
 }
 
-ScalarizeFunction::~ScalarizeFunction()
-{
+bool ScalarizeFunction::doFinalization(llvm::Module& M) {
     releaseAllSCMEntries();
     delete[] m_SCMAllocationArray;
     destroyDummyFunc();
-    V_PRINT(scalarizer, "ScalarizeFunction destructor\n");
+    V_PRINT(scalarizer, "ScalarizeFunction doFinalization\n");
+    return true;
 }
 
 bool ScalarizeFunction::runOnFunction(Function& F)
@@ -157,7 +162,7 @@ bool ScalarizeFunction::runOnFunction(Function& F)
     for (; index != re; ++index)
     {
         // get rid of old users
-        if (Value * val = dyn_cast<Value>(*index))
+        if (Value* val = dyn_cast<Value>(*index))
        {
             UndefValue* undefVal = UndefValue::get((*index)->getType());
             (val)->replaceAllUsesWith(undefVal);
         }
@@ -171,13 +176,18 @@ bool ScalarizeFunction::runOnFunction(Function& F)
 }
 
 ///
-/// @brief We want to avoid scalarize vector-phi node if the vector is used
+/// @brief We want to avoid scalarization of vector instructions if the vector is used
 /// as a whole entity somewhere in the program. This function tries to find
 /// this kind of definition web that involves phi-node, insert-element etc,
 /// then add them into the exclusion-set (excluded from scalarization).
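+///
+/// A minimal sketch of a protected web (illustrative IR only, echoing the
+/// selective.ll test added by this patch): the PHI and the bitcasts form the
+/// interior of the web, the GenISA block write ends a leg, and the whole
+/// chain is kept vectorized:
+///
+///   %v = phi <8 x i32> [ zeroinitializer, %entry ], [ %w, %loop ]
+///   %f = bitcast <8 x i32> %v to <8 x float>
+///   %u = bitcast <8 x float> %f to <8 x i32>
+///   call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(..., <8 x i32> %u)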
 ///
 void ScalarizeFunction::buildExclusiveSet()
 {
+
+    auto isAddToWeb = [](Value* V) -> bool {
+        return isa<BitCastInst>(V) || isa<PHINode>(V);
+    };
+
     auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
     for (auto dfi = df_begin(DT->getRootNode()),
         dfe = df_end(DT->getRootNode()); dfi != dfe; ++dfi)
@@ -190,7 +200,10 @@ void ScalarizeFunction::buildExclusiveSet()
             Instruction* currInst = &*sI;
             ++sI;
             // find the seed for the workset
-            std::vector<llvm::Value*> workset;
+            std::vector<Value*> workset;
+
+            // Instructions that accept vectorial arguments can end legs of the web,
+            // i.e. the instructions that produce the vectorial arguments may be protected from scalarization
             if (GenIntrinsicInst* GII = dyn_cast<GenIntrinsicInst>(currInst))
             {
                 unsigned numOperands = IGCLLVM::getNumArgOperands(GII);
@@ -203,6 +216,16 @@
                     }
                 }
             }
+            else if (CallInst* CI = dyn_cast<CallInst>(currInst))
+            {
+                for (auto arg = CI->arg_begin(); arg != CI->arg_end(); ++arg)
+                {
+                    if (isa<VectorType>(arg->get()->getType()))
+                    {
+                        workset.push_back(arg->get());
+                    }
+                }
+            }
             else if (auto IEI = dyn_cast<InsertElementInst>(currInst))
             {
                 Value* scalarIndexVal = IEI->getOperand(2);
@@ -219,9 +242,12 @@
                     workset.push_back(EEI->getOperand(0));
                 }
             }
-            // try to find a phi-web from the seed
-            bool HasPHI = false;
-            std::set<llvm::Value*> defweb;
+            else if (BitCastInst* BCI = dyn_cast<BitCastInst>(currInst))
+            {
+                workset.push_back(BCI->getOperand(0));
+            }
+            // try to find a web from the seed
+            std::set<Value*> defweb;
             while (!workset.empty())
             {
                 auto Def = workset.back();
@@ -230,70 +256,45 @@
                 {
                     continue;
                 }
-                if (auto IEI = dyn_cast<InsertElementInst>(Def))
-                {
-                    defweb.insert(IEI);
-                    if (!defweb.count(IEI->getOperand(0)) &&
-                        (isa<InsertElementInst>(IEI->getOperand(0)) ||
-                            isa<ShuffleVectorInst>(IEI->getOperand(0)) ||
-                            isa<PHINode>(IEI->getOperand(0))))
-                    {
-                        workset.push_back(IEI->getOperand(0));
-                    }
-                }
-                else if (auto SVI = dyn_cast<ShuffleVectorInst>(Def))
+
+                // The web grows "up" through BitCasts and PHI nodes,
+                // but insert/extract elements and vector shuffles should be scalarized
+                if (!isAddToWeb(Def)) continue;
+
+                if (BitCastInst* BCI = dyn_cast<BitCastInst>(Def))
                 {
-                    defweb.insert(SVI);
-                    if (!defweb.count(SVI->getOperand(0)) &&
-                        (isa<InsertElementInst>(SVI->getOperand(0)) ||
-                            isa<ShuffleVectorInst>(SVI->getOperand(0)) ||
-                            isa<PHINode>(SVI->getOperand(0))))
-                    {
-                        workset.push_back(SVI->getOperand(0));
-                    }
-                    if (!defweb.count(SVI->getOperand(1)) &&
-                        (isa<InsertElementInst>(SVI->getOperand(1)) ||
-                            isa<ShuffleVectorInst>(SVI->getOperand(1)) ||
-                            isa<PHINode>(SVI->getOperand(1))))
+                    defweb.insert(BCI);
+                    if (!defweb.count(BCI->getOperand(0)) && isAddToWeb(BCI->getOperand(0)))
                     {
-                        workset.push_back(SVI->getOperand(1));
+                        workset.push_back(BCI->getOperand(0));
                     }
                 }
                 else if (auto PHI = dyn_cast<PHINode>(Def))
                 {
                     defweb.insert(PHI);
-                    HasPHI = true; // !this def-web is qualified!
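+                    // Each incoming value of the PHI may extend the web further up.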
                     for (int i = 0, n = PHI->getNumOperands(); i < n; ++i)
-                        if (!defweb.count(PHI->getOperand(i)) &&
-                            (isa<InsertElementInst>(PHI->getOperand(i)) ||
-                                isa<ShuffleVectorInst>(PHI->getOperand(i)) ||
-                                isa<PHINode>(PHI->getOperand(i))))
+                    {
+                        if (!defweb.count(PHI->getOperand(i)) && isAddToWeb(PHI->getOperand(i)))
                         {
                             workset.push_back(PHI->getOperand(i));
                         }
+                    }
                 }
                 else
                 {
                     continue;
                 }
-                // check use
+
+                // The web grows "down" through BitCasts and PHI nodes as well
                 for (auto U : Def->users())
                 {
-                    if (!defweb.count(U) &&
-                        (isa<InsertElementInst>(U) ||
-                            isa<ShuffleVectorInst>(U) ||
-                            isa<PHINode>(U)))
+                    if (!defweb.count(U) && isAddToWeb(U))
                    {
                         workset.push_back(U);
                     }
                 }
             }
-            // if we find a qualified web with PHINode, add those instructions
-            // into the exclusion set
-            if (HasPHI)
-            {
-                m_Excludes.merge(defweb);
-            }
+            m_Excludes.merge(defweb);
         }
     }
 }
@@ -390,7 +391,7 @@ void ScalarizeFunction::recoverNonScalarizableInst(Instruction* Inst)
     if (isa<VectorType>(Inst->getType())) getSCMEntry(Inst);
     // Iterate over all arguments. Check that they all exist (or rebuilt)
-    if (CallInst * CI = dyn_cast<CallInst>(Inst))
+    if (CallInst* CI = dyn_cast<CallInst>(Inst))
     {
         unsigned numOperands = IGCLLVM::getNumArgOperands(CI);
         for (unsigned i = 0; i < numOperands; i++)
@@ -508,7 +509,7 @@ void ScalarizeFunction::scalarizeInstruction(BinaryOperator* BI)
             BI->getName(),
             BI
         );
-        if (BinaryOperator * BO = dyn_cast<BinaryOperator>(Val)) {
+        if (BinaryOperator* BO = dyn_cast<BinaryOperator>(Val)) {
             // Copy overflow flags if any.
             if (isa<OverflowingBinaryOperator>(BO)) {
                 BO->setHasNoSignedWrap(BI->hasNoSignedWrap());
@@ -609,7 +610,7 @@ void ScalarizeFunction::scalarizeInstruction(CastInst* CI)
         "unexpected type!");
     IGC_ASSERT_MESSAGE(
         cast<IGCLLVM::FixedVectorType>(CI->getOperand(0)->getType())
-        ->getNumElements() == numElements,
+            ->getNumElements() == numElements,
         "unexpected vector width");
 
     // Obtain scalarized argument
@@ -666,7 +667,7 @@ void ScalarizeFunction::scalarizeInstruction(PHINode* PI)
     {
         auto* Op = PI->getIncomingValue(i);
 
-        if (auto * GII = dyn_cast<GenIntrinsicInst>(Op))
+        if (auto* GII = dyn_cast<GenIntrinsicInst>(Op))
         {
             switch (GII->getIntrinsicID())
             {
@@ -694,7 +695,7 @@ void ScalarizeFunction::scalarizeInstruction(PHINode* PI)
             phis.pop_back();
             for (auto U : PN->users())
             {
-                if (GenIntrinsicInst * GII = dyn_cast<GenIntrinsicInst>(U))
+                if (GenIntrinsicInst* GII = dyn_cast<GenIntrinsicInst>(U))
                 {
                     switch (GII->getIntrinsicID())
                     {
@@ -703,11 +704,16 @@ void ScalarizeFunction::scalarizeInstruction(PHINode* PI)
                     case GenISAIntrinsic::GenISA_sub_group_dpas:
                     case GenISAIntrinsic::GenISA_dpas:
                     case GenISAIntrinsic::GenISA_simdBlockWrite:
+                    case GenISAIntrinsic::GenISA_simdBlockWriteBindless:
+                    case GenISAIntrinsic::GenISA_simdMediaBlockWrite:
+                    case GenISAIntrinsic::GenISA_LSC2DBlockWrite:
+                    case GenISAIntrinsic::GenISA_LSC2DBlockWriteAddrPayload:
+                    case GenISAIntrinsic::GenISA_LSCStoreBlock:
                         recoverNonScalarizableInst(PI);
                         return;
                     }
                 }
-                else if (PHINode * N = dyn_cast<PHINode>(U))
+                else if (PHINode* N = dyn_cast<PHINode>(U))
                 {
                     if (visited.count(N) == 0) {
                         visited[N] = 1;
@@ -720,7 +726,6 @@ void ScalarizeFunction::scalarizeInstruction(PHINode* PI)
         phis.clear();
     }
 
-
     // Prepare empty SCM entry for the instruction
     SCMEntry* newEntry = getSCMEntry(PI);
@@ -1047,7 +1052,7 @@ void ScalarizeFunction::scalarizeInstruction(GetElementPtrInst* GI)
             auto op1 = baseValue->getType()->isVectorTy() ? operand1[i] : baseValue;
             auto op2 = indexValue->getType()->isVectorTy() ?
                 operand2[i] : indexValue;
-            Type *BaseTy = IGCLLVM::getNonOpaquePtrEltTy(op1->getType());
+            Type* BaseTy = IGCLLVM::getNonOpaquePtrEltTy(op1->getType());
             Value* newGEP = GetElementPtrInst::Create(BaseTy, op1, op2,
                 VALUE_NAME(GI->getName()), GI);
             Value* constIndex = ConstantInt::get(Type::getInt32Ty(context()), i);
@@ -1123,7 +1128,7 @@ void ScalarizeFunction::obtainScalarizedValues(SmallVectorImpl<Value*>& retValue
                 retValues[i + destIdx] = undefElement;
             }
         }
-        else if (Constant * vectorConst = dyn_cast<Constant>(origValue))
+        else if (Constant* vectorConst = dyn_cast<Constant>(origValue))
         {
             V_PRINT(scalarizer, "\t\t\tProper constant: " << *vectorConst << "\n");
             // Value is a constant. Break it down to scalars by employing a constant expression
@@ -1310,7 +1315,7 @@ void ScalarizeFunction::updateSCMEntryWithValues(ScalarizeFunction::SCMEntry* en
 
     if (matchDbgLoc)
     {
-        if (const Instruction * origInst = dyn_cast<Instruction>(origValue))
+        if (const Instruction* origInst = dyn_cast<Instruction>(origValue))
         {
             for (unsigned i = 0; i < width; ++i)
             {
@@ -1347,17 +1352,17 @@ void ScalarizeFunction::resolveDeferredInstructions()
 
     // lambda to check if a value is a dummy instruction
     auto isDummyValue = [this](Value* val)
-    {
-        auto* call = dyn_cast<CallInst>(val);
-        if (!call) return false;
-        // If the Value is one of the dummy functions that we created.
-        for (const auto& function : createdDummyFunctions) {
-            if (call->getCalledFunction() == function.second)
-                return true;
-        }
+        {
+            auto* call = dyn_cast<CallInst>(val);
+            if (!call) return false;
+            // If the Value is one of the dummy functions that we created.
+            for (const auto& function : createdDummyFunctions) {
+                if (call->getCalledFunction() == function.second)
+                    return true;
+            }
 
-        return false;
-    };
+            return false;
+        };
 
     for (auto deferredEntry = m_DRL.begin(); m_DRL.size() > 0;)
     {
@@ -1395,8 +1400,8 @@ void ScalarizeFunction::resolveDeferredInstructions()
             newInsts.resize(width);
             for (unsigned i = 0; i < width; i++)
             {
-                Value *constIndex = ConstantInt::get(Type::getInt32Ty(context()), i);
-                Instruction *EE = ExtractElementInst::Create(vectorInst, constIndex,
+                Value* constIndex = ConstantInt::get(Type::getInt32Ty(context()), i);
+                Instruction* EE = ExtractElementInst::Create(vectorInst, constIndex,
                     VALUE_NAME(vectorInst->getName() + ".scalar"), &(*insertLocation));
                 newInsts[i] = EE;
             }
@@ -1417,7 +1422,7 @@ void ScalarizeFunction::resolveDeferredInstructions()
                 // It's possible the scalar values are not resolved earlier and are themselves dummy instructions.
                 // In order to find the real value, we look in the map to see which value replaced it.
                 if (dummyToScalarMap.count(scalarVal))
-                        scalarVal = dummyToScalarMap[scalarVal];
+                    scalarVal = dummyToScalarMap[scalarVal];
                 else
                     totallyResolved = false;
             }
@@ -1441,10 +1446,10 @@ void ScalarizeFunction::resolveDeferredInstructions()
         }
     }
 
-    for (const auto &entry : dummyToScalarMap)
+    for (const auto& entry : dummyToScalarMap)
     {
         // Replace and erase all dummy instructions (don't use eraseFromParent as the dummy is not in the function)
-        Instruction *dummyInst = cast<Instruction>(entry.first);
+        Instruction* dummyInst = cast<Instruction>(entry.first);
         dummyInst->replaceAllUsesWith(entry.second);
         dummyInst->deleteValue();
     }
@@ -1453,9 +1458,8 @@ void ScalarizeFunction::resolveDeferredInstructions()
     m_DRL.clear();
 }
 
-extern "C" FunctionPass* createScalarizerPass(bool selectiveScalarization)
+extern "C" FunctionPass * createScalarizerPass(bool selectiveScalarization)
 {
     return new ScalarizeFunction(selectiveScalarization);
 }
-
diff --git a/IGC/Compiler/Optimizer/Scalarizer.h b/IGC/Compiler/Optimizer/Scalarizer.h
index 5709f52c02fe..67887eab8332 100644
--- a/IGC/Compiler/Optimizer/Scalarizer.h
+++ b/IGC/Compiler/Optimizer/Scalarizer.h
@@ -10,11 +10,11 @@ SPDX-License-Identifier: MIT
 
 #include "common/LLVMWarningsPush.hpp"
 #include 
-#include "llvm/IR/Dominators.h"
+#include <llvm/IR/Dominators.h>
 #include 
 #include 
 #include 
-#include "llvm/ADT/MapVector.h"
+#include <llvm/ADT/MapVector.h>
 #include 
 #include 
 #include 
@@ -23,13 +23,10 @@ SPDX-License-Identifier: MIT
 #include 
 #include 
 #include 
-#include "common/LLVMWarningsPop.hpp"
 #include 
-
-#include 
-#include 
+#include "common/LLVMWarningsPop.hpp"
 #include 
-#include 
+#include 
 
 namespace IGC
 {
@@ -54,10 +51,9 @@ namespace IGC
     public:
         static char ID; // Pass identification, replacement for typeid
 
-        ScalarizeFunction(bool selectiveScalarization = false);
+        ScalarizeFunction(bool selectiveScalarization = true);
         ScalarizeFunction(const ScalarizeFunction&) = delete;
         ScalarizeFunction& operator=(const ScalarizeFunction&) = delete;
-        ~ScalarizeFunction();
 
         /// @brief Provides name of pass
         virtual llvm::StringRef getPassName() const override
@@ -72,12 +68,14 @@ namespace IGC
             AU.setPreservesCFG();
         }
 
+        virtual bool doFinalization(llvm::Module& M) override;
         virtual bool runOnFunction(llvm::Function& F) override;
 
     private:
 
         /// @brief select an exclusive set that would not be scalarized
         void buildExclusiveSet();
+
         /// @brief main Method for dispatching instructions (according to inst type) for scalarization
         /// @param I instruction to dispatch
         void dispatchInstructionToScalarize(llvm::Instruction* I);
@@ -190,7 +188,24 @@ namespace IGC
         inline llvm::Function* getOrCreateDummyFunc(llvm::Type* dummyType, llvm::Module* module) {
             if (createdDummyFunctions.find(dummyType) == createdDummyFunctions.end()) {
                 llvm::FunctionType* funcType = llvm::FunctionType::get(dummyType, false);
-                llvm::Function* function = llvm::Function::Create(funcType, llvm::Function::InternalLinkage, "", module);
+                // Below: change of Internal linkage to External
+                //
+                // Dummy functions are tools used by the pass and they are never defined.
+                // If any dummy functions survive, they are removed in the destructor of the pass.
+                // Thus, the change of the linkage does not impact the net effect of the pass.
+                //
+                // The change is due to the fact that erasing dummy functions in the destructor is not thread-safe.
+                // In my runs of "igc_opt" the LLVM IR code generation would begin before the destructor call.
+                // This crashes LLVM due to the presence of undefined functions.
+                //
+                // It's difficult to properly fix this bug without significant changes to the pass.
+                // Unfortunately, overriding doFinalization does not resolve the problem.
+                //
+                // By changing internal linkage to external, "real-life" compilations proceed as before:
+                // the destructor always gets called, as there are many other passes in the pipeline.
+                // Under the testing conditions, however, LLVM no longer crashes,
+                // but declarations of external functions may appear in the LLVM IR.
+                llvm::Function* function = llvm::Function::Create(funcType, llvm::Function::ExternalLinkage, "", module);
                 createdDummyFunctions[dummyType] = function;
                 return function;
             }
@@ -250,5 +265,11 @@ namespace IGC
 
 } // namespace IGC
 
-/// By default (no argument given to this function), vector load/store are kept as is.
-extern "C" llvm::FunctionPass* createScalarizerPass(bool selectiveScalarization = false);
+/// @brief By default (no argument given to this function), selective scalarization is off.
+/// Selective scalarization keeps some instructions vectorized if the vector is used as a whole entity.
+/// The pass builds a web of instructions protected from scalarization.
+/// The ending legs of the web consist of vectorial instructions such as insert and extract elements,
+/// vector shuffles, GenISA intrinsics and function calls.
+/// The vectorial instructions inside the web consist of bitcasts and PHI nodes.
+extern "C" llvm::FunctionPass * createScalarizerPass(bool selectiveScalarization = false);
+
diff --git a/IGC/Compiler/tests/ScalarizeFunction/selective.ll b/IGC/Compiler/tests/ScalarizeFunction/selective.ll
new file mode 100644
index 000000000000..e20d3511b2f2
--- /dev/null
+++ b/IGC/Compiler/tests/ScalarizeFunction/selective.ll
@@ -0,0 +1,184 @@
+;=========================== begin_copyright_notice ============================
+;
+; Copyright (C) 2022 Intel Corporation
+;
+; SPDX-License-Identifier: MIT
+;
+;============================ end_copyright_notice =============================
+;
+; REQUIRES: regkeys
+; RUN: igc_opt --igc-scalarize -S --regkey=EnableSelectiveScalarizer=1 < %s | FileCheck %s
+; ------------------------------------------------
+; ScalarizeFunction
+; ------------------------------------------------
+; This test checks if selective scalarization leaves vectorial instructions un-scalarized.
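+;
+; Tests 1, 3 and 4 expect their webs (bitcasts and PHI nodes sealed by GenISA
+; intrinsics or calls that consume the whole vector) to stay vectorized.
+; Test 2 keeps the web feeding the block write but scalarizes the extra branch
+; that extracts elements; in test 5 the shufflevectors break the web, so the
+; vector PHI is scalarized.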
+; ------------------------------------------------
+
+define spir_kernel void @test_selective_1(i64 %addr) #0 {
+; CHECK-LABEL: @test_selective_1(
+; CHECK: [[VECT_INT:%.*]] = add <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, zeroinitializer
+; CHECK: [[VECT_FLOAT:%.*]] = bitcast <8 x i32> [[VECT_INT]] to <8 x float>
+; CHECK: [[VECT_INT_2:%.*]] = bitcast <8 x float> [[VECT_FLOAT]] to <8 x i32>
+; CHECK: call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> [[VECT_INT_2]])
+; CHECK: ret void
+;
+
+; define a vector and do some bitcasts
+; nothing should get scalarized here
+
+  %vectint = add <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, zeroinitializer
+  %vectfloat = bitcast <8 x i32> %vectint to <8 x float>
+  %vectcast = bitcast <8 x float> %vectfloat to <8 x i32>
+  call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 %addr, i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> %vectcast)
+
+  ret void
+}
+
+define spir_kernel void @test_selective_2(i64 %addr) #0 {
+; CHECK-LABEL: @test_selective_2(
+; CHECK: [[VECT_INT:%.*]] = add <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, zeroinitializer
+; CHECK: [[VECT_FLOAT:%.*]] = bitcast <8 x i32> [[VECT_INT]] to <8 x float>
+; CHECK: [[VECT_INT_2:%.*]] = bitcast <8 x float> [[VECT_FLOAT]] to <8 x i32>
+; CHECK: call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> [[VECT_INT_2]])
+; CHECK: [[CAST:%.*]] = bitcast <8 x float> [[VECT_FLOAT]] to <8 x i32>
+; CHECK: [[SCALAR_0:%.*]] = extractelement <8 x i32> [[CAST]], i32 0
+; CHECK: [[SCALAR_1:%.*]] = extractelement <8 x i32> [[CAST]], i32 1
+; CHECK: [[SCALAR_2:%.*]] = extractelement <8 x i32> [[CAST]], i32 2
+; CHECK: [[SCALAR_3:%.*]] = extractelement <8 x i32> [[CAST]], i32 3
+; CHECK: [[SCALAR_4:%.*]] = extractelement <8 x i32> [[CAST]], i32 4
+; CHECK: [[SCALAR_5:%.*]] = extractelement <8 x i32> [[CAST]], i32 5
+; CHECK: [[SCALAR_6:%.*]] = extractelement <8 x i32> [[CAST]], i32 6
+; CHECK: [[SCALAR_7:%.*]] = extractelement <8 x i32> [[CAST]], i32 7
+; CHECK: [[ADD:%.*]] = add i32 [[SCALAR_3]], [[SCALAR_5]]
+; CHECK: ret void
+;
+; same as before, but %vectfloat is used in another branch of the code
+  %vectint = add <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, zeroinitializer
+  %vectfloat = bitcast <8 x i32> %vectint to <8 x float>
+  %vectcast = bitcast <8 x float> %vectfloat to <8 x i32>
+  call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 %addr, i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> %vectcast)
+; so scalarization should happen here
+  %anothercast = bitcast <8 x float> %vectfloat to <8 x i32>
+  %v1 = extractelement <8 x i32> %anothercast, i32 3
+  %v2 = extractelement <8 x i32> %anothercast, i32 5
+  %v3 = add i32 %v1, %v2
+  ret void
+}
+
+define spir_kernel void @test_selective_3() {
+; CHECK-LABEL: @test_selective_3(
+; CHECK: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK: [[OFFSET:%.*]] = phi i32 [ 0, [[INIT0:%.*]] ], [ [[NEWOFFSET:%.*]], %[[LOOP]] ]
+; CHECK: [[DATA:%.*]] = phi <8 x i32> [ zeroinitializer, [[INIT0]] ], [ [[NEWDATA:%.*]], %[[LOOP]] ]
+; CHECK: [[NEWDATA]] = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> [[DATA]])
+; CHECK: [[NEWOFFSET]] = add i32 [[OFFSET]], 1
+; CHECK: [[CMP:%.*]] = icmp eq i32 [[NEWOFFSET]], 10
+; CHECK: br i1 [[CMP]], label %[[END:.*]], label %[[LOOP]]
+; CHECK: [[END]]:
+; CHECK: ret void
+;
+; no scalarization happens here because the vectors %data and %newdata are used as a whole
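+; (%data feeds the call to @do_math_v8i32_v8i32 and the call result feeds %data
+; back through the PHI, so the PHI and the call form one closed, protected web)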
+  br label %loop
+
+loop:
+  %offset = phi i32 [ 0, %0 ], [ %newoffset, %loop ]
+
+  %data = phi <8 x i32> [ zeroinitializer, %0 ], [ %newdata, %loop ]
+  %newdata = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> %data)
+
+  %newoffset = add i32 %offset, 1
+  %1 = icmp eq i32 %newoffset, 10
+  br i1 %1, label %end, label %loop
+
+end:
+  ret void
+}
+
+define spir_kernel void @test_selective_4(i64 %addr) #0 {
+; CHECK-LABEL: @test_selective_4(
+; CHECK: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK: [[OFFSET:%.*]] = phi i32 [ 0, [[INIT0:%.*]] ], [ [[NEWOFFSET:%.*]], %[[LOOP]] ]
+; CHECK: [[FLOAT_VECT:%.*]] = phi <8 x float> [ zeroinitializer, [[INIT0]] ], [ [[NEW_FLOAT_VECT:%.*]], %[[LOOP]] ]
+; CHECK: [[INT_VECT:%.*]] = call <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 [[OFFSET]], i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0)
+; CHECK: [[NEW_FLOAT_VECT]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> [[FLOAT_VECT]], <8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i32> [[INT_VECT]], i32 11, i32 11, i32 8, i32 8, i1 false)
+; CHECK: [[NEWOFFSET]] = add i32 [[OFFSET]], 16
+; CHECK: [[CMP:%.*]] = icmp eq i32 [[NEWOFFSET]], 256
+; CHECK: br i1 [[CMP]], label %[[END:.*]], label %[[LOOP]]
+; CHECK: [[END]]:
+; CHECK: ret void
+;
+; same here: no scalarization
+  br label %loop
+
+loop:
+  %offset = phi i32 [ 0, %0 ], [ %newoffset, %loop ]
+  %float_vector = phi <8 x float> [ zeroinitializer, %0 ], [ %new_float_vector, %loop ]
+  %int_vector = call <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64 %addr, i32 1023, i32 511, i32 1023, i32 %offset, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0)
+  %new_float_vector = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %float_vector, <8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i32> %int_vector, i32 11, i32 11, i32 8, i32 8, i1 false)
+  %newoffset = add i32 %offset, 16
+  %1 = icmp eq i32 %newoffset, 256
+  br i1 %1, label %end, label %loop
+
+end:
+  ret void
+}
+
+
+define spir_kernel void @test_selective_5() {
+; CHECK-LABEL: @test_selective_5(
+; CHECK: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK: [[OFFSET:%.*]] = phi i32 [ 0, [[INIT0:%.*]] ], [ [[NEWOFFSET:%.*]], %[[LOOP]] ]
+; CHECK: [[DATA1:%.*]] = phi i32 [ 0, [[INIT0]] ], [ [[NEWDATA_SCALAR:%.*]], %[[LOOP]] ]
+; CHECK: [[DATA3:%.*]] = phi i32 [ 0, [[INIT0]] ], [ [[NEWDATA_SCALAR10:%.*]], %[[LOOP]] ]
+; CHECK: [[DATA4:%.*]] = phi i32 [ 0, [[INIT0]] ], [ [[NEWDATA_SCALAR11:%.*]], %[[LOOP]] ]
+; CHECK: [[DATA5:%.*]] = phi i32 [ 0, [[INIT0]] ], [ [[NEWDATA_SCALAR12:%.*]], %[[LOOP]] ]
+; CHECK: [[DATA6:%.*]] = phi i32 [ 0, [[INIT0]] ], [ undef, %[[LOOP]] ]
+; CHECK: [[DATA7:%.*]] = phi i32 [ 0, [[INIT0]] ], [ undef, %[[LOOP]] ]
+; CHECK: [[DATA8:%.*]] = phi i32 [ 0, [[INIT0]] ], [ undef, %[[LOOP]] ]
+; CHECK: [[DATA9:%.*]] = phi i32 [ 0, [[INIT0]] ], [ undef, %[[LOOP]] ]
+; CHECK: [[VECT:%.*]] = insertelement <4 x i32> undef, i32 [[DATA1]], i32 0
+; CHECK: [[VECT13:%.*]] = insertelement <4 x i32> [[VECT]], i32 [[DATA3]], i32 1
+; CHECK: [[VECT14:%.*]] = insertelement <4 x i32> [[VECT13]], i32 [[DATA4]], i32 2
+; CHECK: [[VECT15:%.*]] = insertelement <4 x i32> [[VECT14]], i32 [[DATA5]], i32 3
+; CHECK: [[NEWDATA:%.*]] = call <4 x i32> @do_math_v4i32_v4i32(<4 x i32> [[VECT15]])
+; CHECK: [[NEWDATA_SCALAR]] = extractelement <4 x i32> [[NEWDATA]], i32 0
+; CHECK: [[NEWDATA_SCALAR10]] = extractelement <4 x i32> [[NEWDATA]], i32 1
+; CHECK: [[NEWDATA_SCALAR11]] = extractelement <4 x i32> [[NEWDATA]], i32 2
+; CHECK: [[NEWDATA_SCALAR12]] = extractelement <4 x i32> [[NEWDATA]], i32 3
+; CHECK: [[NEWOFFSET]] = add i32 [[OFFSET]], 1
+; CHECK: [[CMP:%.*]] = icmp eq i32 [[NEWOFFSET]], 10
+; CHECK: br i1 [[CMP]], label %[[END:.*]], label %[[LOOP]]
+; CHECK: [[END]]:
+; CHECK: ret void
+;
+; here the shufflevectors break the vectorial nature of the arguments,
+; so scalarization should be done
+  br label %loop
+
+loop:
+  %offset = phi i32 [ 0, %0 ], [ %newoffset, %loop ]
+
+  %data = phi <8 x i32> [ zeroinitializer, %0 ], [ %newdata2, %loop ]
+  %data2 = shufflevector <8 x i32> %data, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %newdata = call <4 x i32> @do_math_v4i32_v4i32(<4 x i32> %data2)
+  %newdata2 = shufflevector <4 x i32> %newdata, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+  %newoffset = add i32 %offset, 1
+  %1 = icmp eq i32 %newoffset, 10
+  br i1 %1, label %end, label %loop
+
+end:
+  ret void
+}
+
+declare spir_func void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32, <8 x i32>) #1
+declare spir_func <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32) #1
+declare spir_func <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1) #1
+declare spir_func <4 x i32> @do_math_v4i32_v4i32(<4 x i32>) #1
+declare spir_func <8 x i32> @do_math_v8i32_v8i32(<8 x i32>) #1
+
+attributes #0 = { convergent nounwind }
+attributes #1 = { nounwind }