diff --git a/IGC/Compiler/Optimizer/Scalarizer.cpp b/IGC/Compiler/Optimizer/Scalarizer.cpp
index 36486b4d8ea7..33e4a488a3e5 100644
--- a/IGC/Compiler/Optimizer/Scalarizer.cpp
+++ b/IGC/Compiler/Optimizer/Scalarizer.cpp
@@ -23,8 +23,8 @@ SPDX-License-Identifier: MIT
 #include "common/LLVMWarningsPop.hpp"
 #include "common/igc_regkeys.hpp"
 #include "common/Types.hpp"
-#include <set>
 #include "Probe/Assertion.h"
+#include <set>
 
 using namespace llvm;
 using namespace IGC;
@@ -62,6 +62,8 @@ ScalarizeFunction::ScalarizeFunction(bool selectiveScalarization) : FunctionPass
     initializeScalarizeFunctionPass(*PassRegistry::getPassRegistry());
     for (int i = 0; i < Instruction::OtherOpsEnd; i++)
         m_transposeCtr[i] = 0;
+
+    // Needs IGC_EnableSelectiveScalarizer = 1
     m_SelectiveScalarization = selectiveScalarization;
 
     // Initialize SCM buffers and allocation
@@ -70,14 +72,17 @@ ScalarizeFunction::ScalarizeFunction(bool selectiveScalarization) : FunctionPass
     m_SCMArrayLocation = 0;
 
     V_PRINT(scalarizer, "ScalarizeFunction constructor\n");
+    V_PRINT(scalarizer, "IGC_EnableSelectiveScalarizer = ");
+    V_PRINT(scalarizer, IGC_IS_FLAG_ENABLED(EnableSelectiveScalarizer));
+    V_PRINT(scalarizer, "\n");
 }
 
-ScalarizeFunction::~ScalarizeFunction()
-{
+bool ScalarizeFunction::doFinalization(llvm::Module& M) {
     releaseAllSCMEntries();
     delete[] m_SCMAllocationArray;
     destroyDummyFunc();
-    V_PRINT(scalarizer, "ScalarizeFunction destructor\n");
+    V_PRINT(scalarizer, "ScalarizeFunction doFinalization\n");
+    return true;
 }
 
 bool ScalarizeFunction::runOnFunction(Function& F)
@@ -157,7 +162,7 @@ bool ScalarizeFunction::runOnFunction(Function& F)
     for (; index != re; ++index)
     {
         // get rid of old users
-        if (Value * val = dyn_cast<Value>(*index))
+        if (Value* val = dyn_cast<Value>(*index))
        {
            UndefValue* undefVal = UndefValue::get((*index)->getType());
            (val)->replaceAllUsesWith(undefVal);
@@ -171,13 +176,18 @@ bool ScalarizeFunction::runOnFunction(Function& F)
 }
 
 ///
-/// @brief We want to avoid scalarize vector-phi node if the vector is used
+/// @brief We want to avoid scalarization of vector instructions if the vector is used
 /// as a whole entity somewhere in the program. This function tries to find
 /// this kind of definition web that involves phi-node, insert-element etc,
 /// then add them into the exclusion-set (excluded from scalarization).
 ///
 void ScalarizeFunction::buildExclusiveSet()
 {
+
+    auto isAddToWeb = [](Value* V) -> bool {
+        return isa<BitCastInst>(V) || isa<PHINode>(V);
+    };
+
     auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
     for (auto dfi = df_begin(DT->getRootNode()),
         dfe = df_end(DT->getRootNode()); dfi != dfe; ++dfi)
@@ -190,7 +200,10 @@ void ScalarizeFunction::buildExclusiveSet()
             Instruction* currInst = &*sI;
             ++sI;
             // find the seed for the workset
-            std::vector<llvm::Value*> workset;
+            std::vector<Value*> workset;
+
+            // Instructions that accept vectorial arguments can end legs of the web,
+            // i.e. the instructions that produce the vectorial arguments may be protected from scalarization
             if (GenIntrinsicInst* GII = dyn_cast<GenIntrinsicInst>(currInst))
             {
                 unsigned numOperands = IGCLLVM::getNumArgOperands(GII);
@@ -203,6 +216,16 @@ void ScalarizeFunction::buildExclusiveSet()
                     }
                 }
             }
+            else if (CallInst * CI = dyn_cast<CallInst>(currInst))
+            {
+                for (auto arg = CI->arg_begin(); arg != CI->arg_end(); ++arg)
+                {
+                    if (isa<VectorType>(arg->get()->getType()))
+                    {
+                        workset.push_back(arg->get());
+                    }
+                }
+            }
             else if (auto IEI = dyn_cast<InsertElementInst>(currInst))
             {
                 Value* scalarIndexVal = IEI->getOperand(2);
@@ -219,9 +242,12 @@ void ScalarizeFunction::buildExclusiveSet()
                     workset.push_back(EEI->getOperand(0));
                 }
             }
-            // try to find a phi-web from the seed
-            bool HasPHI = false;
-            std::set<Value*> defweb;
+            else if (BitCastInst* BCI = dyn_cast<BitCastInst>(currInst))
+            {
+                workset.push_back(BCI->getOperand(0));
+            }
+            // try to find a web from the seed
+            std::set<Value*> defweb;
             while (!workset.empty())
             {
                 auto Def = workset.back();
@@ -230,70 +256,45 @@ void ScalarizeFunction::buildExclusiveSet()
                 {
                     continue;
                 }
-                if (auto IEI = dyn_cast<InsertElementInst>(Def))
-                {
-                    defweb.insert(IEI);
-                    if (!defweb.count(IEI->getOperand(0)) &&
-                        (isa<InsertElementInst>(IEI->getOperand(0)) ||
-                            isa<ShuffleVectorInst>(IEI->getOperand(0)) ||
-                            isa<PHINode>(IEI->getOperand(0))))
-                    {
-                        workset.push_back(IEI->getOperand(0));
-                    }
-                }
-                else if (auto SVI = dyn_cast<ShuffleVectorInst>(Def))
+
+                // The web grows "up" through BitCasts and PHI nodes
+                // but insert/extract elements and vector shuffles should be scalarized
+                if (!isAddToWeb(Def)) continue;
+
+                if (BitCastInst* BCI = dyn_cast<BitCastInst>(Def))
                 {
-                    defweb.insert(SVI);
-                    if (!defweb.count(SVI->getOperand(0)) &&
-                        (isa<InsertElementInst>(SVI->getOperand(0)) ||
-                            isa<ShuffleVectorInst>(SVI->getOperand(0)) ||
-                            isa<PHINode>(SVI->getOperand(0))))
-                    {
-                        workset.push_back(SVI->getOperand(0));
-                    }
-                    if (!defweb.count(SVI->getOperand(1)) &&
-                        (isa<InsertElementInst>(SVI->getOperand(1)) ||
-                            isa<ShuffleVectorInst>(SVI->getOperand(1)) ||
-                            isa<PHINode>(SVI->getOperand(1))))
+                    defweb.insert(BCI);
+                    if (!defweb.count(BCI->getOperand(0)) && isAddToWeb(BCI->getOperand(0)))
                     {
-                        workset.push_back(SVI->getOperand(1));
+                        workset.push_back(BCI->getOperand(0));
                     }
                 }
                 else if (auto PHI = dyn_cast<PHINode>(Def))
                 {
                     defweb.insert(PHI);
-                    HasPHI = true; // !this def-web is qualified!
                     for (int i = 0, n = PHI->getNumOperands(); i < n; ++i)
-                        if (!defweb.count(PHI->getOperand(i)) &&
-                            (isa<InsertElementInst>(PHI->getOperand(i)) ||
-                                isa<ShuffleVectorInst>(PHI->getOperand(i)) ||
-                                isa<PHINode>(PHI->getOperand(i))))
+                    {
+                        if (!defweb.count(PHI->getOperand(i)) && isAddToWeb(PHI->getOperand(i)))
                         {
                             workset.push_back(PHI->getOperand(i));
                         }
+                    }
                 }
                 else
                 {
                     continue;
                 }
-                // check use
+
+                // The web grows "down" through BitCasts and PHI nodes as well
                 for (auto U : Def->users())
                 {
-                    if (!defweb.count(U) &&
-                        (isa<InsertElementInst>(U) ||
-                            isa<ShuffleVectorInst>(U) ||
-                            isa<PHINode>(U)))
+                    if (!defweb.count(U) && isAddToWeb(U))
                     {
                         workset.push_back(U);
                     }
                 }
             }
-            // if we find a qualified web with PHINode, add those instructions
-            // into the exclusion set
-            if (HasPHI)
-            {
-                m_Excludes.merge(defweb);
-            }
+            m_Excludes.merge(defweb);
         }
     }
 }
@@ -390,7 +391,7 @@ void ScalarizeFunction::recoverNonScalarizableInst(Instruction* Inst)
     if (isa<VectorType>(Inst->getType())) getSCMEntry(Inst);
 
     // Iterate over all arguments. Check that they all exist (or rebuilt)
-    if (CallInst * CI = dyn_cast<CallInst>(Inst))
+    if (CallInst* CI = dyn_cast<CallInst>(Inst))
     {
         unsigned numOperands = IGCLLVM::getNumArgOperands(CI);
         for (unsigned i = 0; i < numOperands; i++)
@@ -508,7 +509,7 @@ void ScalarizeFunction::scalarizeInstruction(BinaryOperator* BI)
             BI->getName(),
             BI
         );
-        if (BinaryOperator * BO = dyn_cast<BinaryOperator>(Val)) {
+        if (BinaryOperator* BO = dyn_cast<BinaryOperator>(Val)) {
             // Copy overflow flags if any.
             if (isa<OverflowingBinaryOperator>(BO)) {
                 BO->setHasNoSignedWrap(BI->hasNoSignedWrap());
@@ -609,7 +610,7 @@ void ScalarizeFunction::scalarizeInstruction(CastInst* CI)
             "unexpected type!");
         IGC_ASSERT_MESSAGE(
             cast<IGCLLVM::FixedVectorType>(CI->getOperand(0)->getType())
-                ->getNumElements() == numElements,
+            ->getNumElements() == numElements,
             "unexpected vector width");
 
     // Obtain scalarized argument
@@ -666,7 +667,7 @@ void ScalarizeFunction::scalarizeInstruction(PHINode* PI)
     {
         auto* Op = PI->getIncomingValue(i);
 
-        if (auto * GII = dyn_cast<GenIntrinsicInst>(Op))
+        if (auto* GII = dyn_cast<GenIntrinsicInst>(Op))
         {
             switch (GII->getIntrinsicID())
             {
@@ -694,7 +695,7 @@ void ScalarizeFunction::scalarizeInstruction(PHINode* PI)
             phis.pop_back();
             for (auto U : PN->users())
             {
-                if (GenIntrinsicInst * GII = dyn_cast<GenIntrinsicInst>(U))
+                if (GenIntrinsicInst* GII = dyn_cast<GenIntrinsicInst>(U))
                 {
                     switch (GII->getIntrinsicID())
                     {
@@ -703,11 +704,16 @@ void ScalarizeFunction::scalarizeInstruction(PHINode* PI)
                     case GenISAIntrinsic::GenISA_sub_group_dpas:
                     case GenISAIntrinsic::GenISA_dpas:
                     case GenISAIntrinsic::GenISA_simdBlockWrite:
+                    case GenISAIntrinsic::GenISA_simdBlockWriteBindless:
+                    case GenISAIntrinsic::GenISA_simdMediaBlockWrite:
+                    case GenISAIntrinsic::GenISA_LSC2DBlockWrite:
+                    case GenISAIntrinsic::GenISA_LSC2DBlockWriteAddrPayload:
+                    case GenISAIntrinsic::GenISA_LSCStoreBlock:
                         recoverNonScalarizableInst(PI);
                         return;
                     }
                 }
-                else if (PHINode * N = dyn_cast<PHINode>(U))
+                else if (PHINode* N = dyn_cast<PHINode>(U))
                 {
                     if (visited.count(N) == 0) {
                         visited[N] = 1;
@@ -720,7 +726,6 @@ void ScalarizeFunction::scalarizeInstruction(PHINode* PI)
         }
         phis.clear();
     }
 
-
     // Prepare empty SCM entry for the instruction
     SCMEntry* newEntry = getSCMEntry(PI);
@@ -1047,7 +1052,7 @@ void ScalarizeFunction::scalarizeInstruction(GetElementPtrInst* GI)
             auto op1 = baseValue->getType()->isVectorTy() ? operand1[i] : baseValue;
             auto op2 = indexValue->getType()->isVectorTy() ? operand2[i] : indexValue;
 
-            Type *BaseTy = IGCLLVM::getNonOpaquePtrEltTy(op1->getType());
+            Type* BaseTy = IGCLLVM::getNonOpaquePtrEltTy(op1->getType());
             Value* newGEP = GetElementPtrInst::Create(BaseTy, op1, op2,
                 VALUE_NAME(GI->getName()), GI);
             Value* constIndex = ConstantInt::get(Type::getInt32Ty(context()), i);
@@ -1123,7 +1128,7 @@ void ScalarizeFunction::obtainScalarizedValues(SmallVectorImpl<Value*>& retValue
                 retValues[i + destIdx] = undefElement;
             }
         }
-        else if (Constant * vectorConst = dyn_cast<Constant>(origValue))
+        else if (Constant* vectorConst = dyn_cast<Constant>(origValue))
        {
            V_PRINT(scalarizer, "\t\t\tProper constant: " << *vectorConst << "\n");
            // Value is a constant. Break it down to scalars by employing a constant expression
@@ -1310,7 +1315,7 @@ void ScalarizeFunction::updateSCMEntryWithValues(ScalarizeFunction::SCMEntry* en
 
     if (matchDbgLoc)
     {
-        if (const Instruction * origInst = dyn_cast<Instruction>(origValue))
+        if (const Instruction* origInst = dyn_cast<Instruction>(origValue))
         {
             for (unsigned i = 0; i < width; ++i)
             {
@@ -1347,17 +1352,17 @@ void ScalarizeFunction::resolveDeferredInstructions()
 
     // lambda to check if a value is a dummy instruction
     auto isDummyValue = [this](Value* val)
-    {
-        auto* call = dyn_cast<CallInst>(val);
-        if (!call) return false;
-        // If the Value is one of the dummy functions that we created.
+        {
+            auto* call = dyn_cast<CallInst>(val);
+            if (!call) return false;
+            // If the Value is one of the dummy functions that we created.
-        for (const auto& function : createdDummyFunctions) {
-            if (call->getCalledFunction() == function.second)
-                return true;
-        }
+            for (const auto& function : createdDummyFunctions) {
+                if (call->getCalledFunction() == function.second)
+                    return true;
+            }
 
-        return false;
-    };
+            return false;
+        };
 
     for (auto deferredEntry = m_DRL.begin(); m_DRL.size() > 0;)
     {
@@ -1395,8 +1400,8 @@ void ScalarizeFunction::resolveDeferredInstructions()
                 newInsts.resize(width);
                 for (unsigned i = 0; i < width; i++)
                 {
-                    Value *constIndex = ConstantInt::get(Type::getInt32Ty(context()), i);
-                    Instruction *EE = ExtractElementInst::Create(vectorInst, constIndex,
+                    Value* constIndex = ConstantInt::get(Type::getInt32Ty(context()), i);
+                    Instruction* EE = ExtractElementInst::Create(vectorInst, constIndex,
                         VALUE_NAME(vectorInst->getName() + ".scalar"), &(*insertLocation));
                     newInsts[i] = EE;
                 }
@@ -1417,7 +1422,7 @@ void ScalarizeFunction::resolveDeferredInstructions()
                     // It's possible the scalar values are not resolved earlier and are themselves dummy instructions.
                    // In order to find the real value, we look in the map to see which value replaced it.
                    if (dummyToScalarMap.count(scalarVal))
-                         scalarVal = dummyToScalarMap[scalarVal];
+                        scalarVal = dummyToScalarMap[scalarVal];
                     else
                         totallyResolved = false;
                 }
@@ -1441,10 +1446,10 @@ void ScalarizeFunction::resolveDeferredInstructions()
         }
     }
 
-    for (const auto &entry : dummyToScalarMap)
+    for (const auto& entry : dummyToScalarMap)
     {
         // Replace and erase all dummy instructions (don't use eraseFromParent as the dummy is not in the function)
-        Instruction *dummyInst = cast<Instruction>(entry.first);
+        Instruction* dummyInst = cast<Instruction>(entry.first);
         dummyInst->replaceAllUsesWith(entry.second);
         dummyInst->deleteValue();
     }
@@ -1453,9 +1458,8 @@ void ScalarizeFunction::resolveDeferredInstructions()
 
     m_DRL.clear();
 }
 
-extern "C" FunctionPass* createScalarizerPass(bool selectiveScalarization)
+extern "C" FunctionPass * createScalarizerPass(bool selectiveScalarization)
 {
     return new ScalarizeFunction(selectiveScalarization);
 }
-
diff --git a/IGC/Compiler/Optimizer/Scalarizer.h b/IGC/Compiler/Optimizer/Scalarizer.h
index 5709f52c02fe..67887eab8332 100644
--- a/IGC/Compiler/Optimizer/Scalarizer.h
+++ b/IGC/Compiler/Optimizer/Scalarizer.h
@@ -10,11 +10,11 @@ SPDX-License-Identifier: MIT
 
 #include "common/LLVMWarningsPush.hpp"
 #include <llvm/Pass.h>
-#include "llvm/IR/Dominators.h"
+#include <llvm/IR/Dominators.h>
 #include <llvm/IR/Function.h>
 #include <llvm/IR/Instructions.h>
 #include <llvm/IR/InstVisitor.h>
-#include "llvm/ADT/MapVector.h"
+#include <llvm/ADT/MapVector.h>
 #include <llvm/ADT/DenseMap.h>
 #include <llvm/ADT/DenseSet.h>
 #include <llvm/ADT/SmallVector.h>
@@ -23,13 +23,10 @@ SPDX-License-Identifier: MIT
 #include <llvm/ADT/SmallPtrSet.h>
 #include <llvm/IR/IRBuilder.h>
 #include <llvm/IR/Module.h>
-#include "common/LLVMWarningsPop.hpp"
 #include <llvm/IR/ValueHandle.h>
-
-#include <sstream>
-#include <string>
+#include "common/LLVMWarningsPop.hpp"
 #include <map>
-#include <set>
+#include <vector>
 
 namespace IGC
 {
@@ -54,10 +51,9 @@ namespace IGC
     public:
         static char ID; // Pass identification, replacement for typeid
 
-        ScalarizeFunction(bool selectiveScalarization = false);
+        ScalarizeFunction(bool selectiveScalarization = true);
         ScalarizeFunction(const ScalarizeFunction&) = delete;
         ScalarizeFunction& operator=(const ScalarizeFunction&) = delete;
-        ~ScalarizeFunction();
 
         /// @brief Provides name of pass
         virtual llvm::StringRef getPassName() const override
@@ -72,12 +68,14 @@ namespace IGC
             AU.setPreservesCFG();
         }
 
+        virtual bool doFinalization(llvm::Module& M) override;
         virtual bool runOnFunction(llvm::Function& F) override;
 
     private:
 
         /// @brief select an exclusive set that would not be scalarized
         void buildExclusiveSet();
+
         /// @brief main Method for dispatching instructions (according to inst type) for scalarization
         /// @param I instruction to dispatch
         void dispatchInstructionToScalarize(llvm::Instruction* I);
@@ -190,7 +188,24 @@ namespace IGC
         inline llvm::Function* getOrCreateDummyFunc(llvm::Type* dummyType, llvm::Module* module) {
             if (createdDummyFunctions.find(dummyType) == createdDummyFunctions.end()) {
                 llvm::FunctionType* funcType = llvm::FunctionType::get(dummyType, false);
-                llvm::Function* function = llvm::Function::Create(funcType, llvm::Function::InternalLinkage, "", module);
+                // Below: change of Internal linkage to External
+                //
+                // Dummy functions are tools used by the pass and they are never defined.
+                // If any dummy functions survive, they are removed in the destructor of the pass.
+                // Thus, the change of the linkage does not impact the net effect of the pass.
+                //
+                // The change is needed because erasing dummy functions in the destructor is not thread-safe.
+                // In my runs of "igc_opt", LLVM IR code generation would begin before the destructor call,
+                // which crashes LLVM due to the presence of undefined functions.
+                //
+                // It's difficult to properly fix this bug without significant changes to the pass.
+                // Unfortunately, overriding doFinalization does not resolve the problem.
+                //
+                // By changing internal linkage to external, "real-life" compilations proceed as before:
+                // the destructor always gets called, as there are many other passes in the pipeline.
+                // Under test conditions, however, LLVM no longer crashes,
+                // but declarations of external functions may appear in the LLVM IR.
+                llvm::Function* function = llvm::Function::Create(funcType, llvm::Function::ExternalLinkage, "", module);
                 createdDummyFunctions[dummyType] = function;
                 return function;
             }
@@ -250,5 +265,11 @@ namespace IGC
 
 } // namespace IGC
 
-/// By default (no argument given to this function), vector load/store are kept as is.
-extern "C" llvm::FunctionPass* createScalarizerPass(bool selectiveScalarization = false);
+/// @brief By default (no argument given to this function), selective scalarization is off.
+/// Selective scalarization keeps some instructions vectorized if the vector is used as a whole entity.
+/// The pass builds a web of instructions protected from scalarization.
+/// The ending legs of the web consist of vectorial instructions such as insert and extract elements,
+/// vector shuffles, GenISA intrinsics and function calls.
+/// The interior of the web consists of bitcasts and PHI nodes.
+extern "C" llvm::FunctionPass * createScalarizerPass(bool selectiveScalarization = false);
+
diff --git a/IGC/Compiler/tests/ScalarizeFunction/selective.ll b/IGC/Compiler/tests/ScalarizeFunction/selective.ll
new file mode 100644
index 000000000000..e20d3511b2f2
--- /dev/null
+++ b/IGC/Compiler/tests/ScalarizeFunction/selective.ll
@@ -0,0 +1,184 @@
+;=========================== begin_copyright_notice ============================
+;
+; Copyright (C) 2022 Intel Corporation
+;
+; SPDX-License-Identifier: MIT
+;
+;============================ end_copyright_notice =============================
+;
+; REQUIRES: regkeys
+; RUN: igc_opt --igc-scalarize -S --regkey=EnableSelectiveScalarizer=1 < %s | FileCheck %s
+; ------------------------------------------------
+; ScalarizeFunction
+; ------------------------------------------------
+; This test checks if selective scalarization leaves vectorial instructions un-scalarized.
+; ------------------------------------------------
+
+define spir_kernel void @test_selective_1(i64 %addr) #0 {
+; CHECK-LABEL: @test_selective_1(
+; CHECK:    [[VECT_INT:%.*]] = add <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, zeroinitializer
+; CHECK:    [[VECT_FLOAT:%.*]] = bitcast <8 x i32> [[VECT_INT]] to <8 x float>
+; CHECK:    [[VECT_INT_2:%.*]] = bitcast <8 x float> [[VECT_FLOAT]] to <8 x i32>
+; CHECK:    call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> [[VECT_INT_2]])
+; CHECK:    ret void
+;
+
+; define a vector and do some bitcasts
+; nothing should get scalarized here
+
+  %vectint = add <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, zeroinitializer
+  %vectfloat = bitcast <8 x i32> %vectint to <8 x float>
+  %vectcast = bitcast <8 x float> %vectfloat to <8 x i32>
+  call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 %addr, i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> %vectcast)
+
+  ret void
+}
+
+define spir_kernel void @test_selective_2(i64 %addr) #0 {
+; CHECK-LABEL: @test_selective_2(
+; CHECK:    [[VECT_INT:%.*]] = add <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, zeroinitializer
+; CHECK:    [[VECT_FLOAT:%.*]] = bitcast <8 x i32> [[VECT_INT]] to <8 x float>
+; CHECK:    [[VECT_INT_2:%.*]] = bitcast <8 x float> [[VECT_FLOAT]] to <8 x i32>
+; CHECK:    call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> [[VECT_INT_2]])
+; CHECK:    [[CAST:%.*]] = bitcast <8 x float> [[VECT_FLOAT]] to <8 x i32>
+; CHECK:    [[SCALAR_0:%.*]] = extractelement <8 x i32> [[CAST]], i32 0
+; CHECK:    [[SCALAR_1:%.*]] = extractelement <8 x i32> [[CAST]], i32 1
+; CHECK:    [[SCALAR_2:%.*]] = extractelement <8 x i32> [[CAST]], i32 2
+; CHECK:    [[SCALAR_3:%.*]] = extractelement <8 x i32> [[CAST]], i32 3
+; CHECK:    [[SCALAR_4:%.*]] = extractelement <8 x i32> [[CAST]], i32 4
+; CHECK:    [[SCALAR_5:%.*]] = extractelement <8 x i32> [[CAST]], i32 5
+; CHECK:    [[SCALAR_6:%.*]] = extractelement <8 x i32> [[CAST]], i32 6
+; CHECK:    [[SCALAR_7:%.*]] = extractelement <8 x i32> [[CAST]], i32 7
+; CHECK:    [[ADD:%.*]] = add i32 [[SCALAR_3]], [[SCALAR_5]]
+; CHECK:    ret void
+;
+; same as before, but %vectfloat is used in another branch of the code
+  %vectint = add <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, zeroinitializer
+  %vectfloat = bitcast <8 x i32> %vectint to <8 x float>
+  %vectcast = bitcast <8 x float> %vectfloat to <8 x i32>
+  call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 %addr, i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> %vectcast)
+; so scalarization should happen here
+  %anothercast = bitcast <8 x float> %vectfloat to <8 x i32>
+  %v1 = extractelement <8 x i32> %anothercast, i32 3
+  %v2 = extractelement <8 x i32> %anothercast, i32 5
+  %v3 = add i32 %v1, %v2
+  ret void
+}
+
+define spir_kernel void @test_selective_3() {
+; CHECK-LABEL: @test_selective_3(
+; CHECK:    br label %[[LOOP:.*]]
+; CHECK:  [[LOOP]]:
+; CHECK:    [[OFFSET:%.*]] = phi i32 [ 0, [[INIT0:%.*]] ], [ [[NEWOFFSET:%.*]], %[[LOOP]] ]
+; CHECK:    [[DATA:%.*]] = phi <8 x i32> [ zeroinitializer, [[INIT0]] ], [ [[NEWDATA:%.*]], %[[LOOP]] ]
+; CHECK:    [[NEWDATA]] = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> [[DATA]])
+; CHECK:    [[NEWOFFSET]] = add i32 [[OFFSET]], 1
+; CHECK:    [[CMP:%.*]] = icmp eq i32 [[NEWOFFSET]], 10
+; CHECK:    br i1 [[CMP]], label %[[END:.*]], label %[[LOOP]]
+; CHECK:  [[END]]:
+; CHECK:    ret void
+;
+; no scalarization happens here because the vectors %data and %newdata are used as a whole
+  br label %loop
+
+loop:
+  %offset = phi i32 [ 0, %0 ], [ %newoffset, %loop ]
+
+  %data = phi <8 x i32> [ zeroinitializer, %0 ], [ %newdata, %loop ]
+  %newdata = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> %data)
+
+  %newoffset = add i32 %offset, 1
+  %1 = icmp eq i32 %newoffset, 10
+  br i1 %1, label %end, label %loop
+
+end:
+  ret void
+}
+
+define spir_kernel void @test_selective_4(i64 %addr) #0 {
+; CHECK-LABEL: @test_selective_4(
+; CHECK:    br label %[[LOOP:.*]]
+; CHECK:  [[LOOP]]:
+; CHECK:    [[OFFSET:%.*]] = phi i32 [ 0, [[INIT0:%.*]] ], [ [[NEWOFFSET:%.*]], %[[LOOP]] ]
+; CHECK:    [[FLOAT_VECT:%.*]] = phi <8 x float> [ zeroinitializer, [[INIT0]] ], [ [[NEW_FLOAT_VECT:%.*]], %[[LOOP]] ]
+; CHECK:    [[INT_VECT:%.*]] = call <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 [[OFFSET]], i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0)
+; CHECK:    [[NEW_FLOAT_VECT]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> [[FLOAT_VECT]], <8 x i16> zeroinitializer, <8 x i32> [[INT_VECT]], i32 11, i32 11, i32 8, i32 8, i1 false)
+; CHECK:    [[NEWOFFSET]] = add i32 [[OFFSET]], 16
+; CHECK:    [[CMP:%.*]] = icmp eq i32 [[NEWOFFSET]], 256
+; CHECK:    br i1 [[CMP]], label %[[END:.*]], label %[[LOOP]]
+; CHECK:  [[END]]:
+; CHECK:    ret void
+;
+; same here: no scalarization
+  br label %loop
+
+loop:
+  %offset = phi i32 [ 0, %0 ], [ %newoffset, %loop ]
+  %float_vector = phi <8 x float> [ zeroinitializer, %0 ], [ %new_float_vector, %loop ]
+  %int_vector = call <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64 %addr, i32 1023, i32 511, i32 1023, i32 %offset, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0)
+  %new_float_vector = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %float_vector, <8 x i16> zeroinitializer, <8 x i32> %int_vector, i32 11, i32 11, i32 8, i32 8, i1 false)
+  %newoffset = add i32 %offset, 16
+  %1 = icmp eq i32 %newoffset, 256
+  br i1 %1, label %end, label %loop
+
+end:
+  ret void
+}
+
+
+define spir_kernel void @test_selective_5() {
+; CHECK-LABEL: @test_selective_5(
+; CHECK:    br label %[[LOOP:.*]]
+; CHECK:  [[LOOP]]:
+; CHECK:    [[OFFSET:%.*]] = phi i32 [ 0, [[INIT0:%.*]] ], [ [[NEWOFFSET:%.*]], %[[LOOP]] ]
+; CHECK:    [[DATA1:%.*]] = phi i32 [ 0, [[INIT0]] ], [ [[NEWDATA_SCALAR:%.*]], %[[LOOP]] ]
+; CHECK:    [[DATA3:%.*]] = phi i32 [ 0, [[INIT0]] ], [ [[NEWDATA_SCALAR10:%.*]], %[[LOOP]] ]
+; CHECK:    [[DATA4:%.*]] = phi i32 [ 0, [[INIT0]] ], [ [[NEWDATA_SCALAR11:%.*]], %[[LOOP]] ]
+; CHECK:    [[DATA5:%.*]] = phi i32 [ 0, [[INIT0]] ], [ [[NEWDATA_SCALAR12:%.*]], %[[LOOP]] ]
+; CHECK:    [[DATA6:%.*]] = phi i32 [ 0, [[INIT0]] ], [ undef, %[[LOOP]] ]
+; CHECK:    [[DATA7:%.*]] = phi i32 [ 0, [[INIT0]] ], [ undef, %[[LOOP]] ]
+; CHECK:    [[DATA8:%.*]] = phi i32 [ 0, [[INIT0]] ], [ undef, %[[LOOP]] ]
+; CHECK:    [[DATA9:%.*]] = phi i32 [ 0, [[INIT0]] ], [ undef, %[[LOOP]] ]
+; CHECK:    [[VECT:%.*]] = insertelement <4 x i32> undef, i32 [[DATA1]], i32 0
+; CHECK:    [[VECT13:%.*]] = insertelement <4 x i32> [[VECT]], i32 [[DATA3]], i32 1
+; CHECK:    [[VECT14:%.*]] = insertelement <4 x i32> [[VECT13]], i32 [[DATA4]], i32 2
+; CHECK:    [[VECT15:%.*]] = insertelement <4 x i32> [[VECT14]], i32 [[DATA5]], i32 3
+; CHECK:    [[NEWDATA:%.*]] = call <4 x i32> @do_math_v4i32_v4i32(<4 x i32> [[VECT15]])
+; CHECK:    [[NEWDATA_SCALAR]] = extractelement <4 x i32> [[NEWDATA]], i32 0
+; CHECK:    [[NEWDATA_SCALAR10]] = extractelement <4 x i32> [[NEWDATA]], i32 1
+; CHECK:    [[NEWDATA_SCALAR11]] = extractelement <4 x i32> [[NEWDATA]], i32 2
+; CHECK:    [[NEWDATA_SCALAR12]] = extractelement <4 x i32> [[NEWDATA]], i32 3
+; CHECK:    [[NEWOFFSET]] = add i32 [[OFFSET]], 1
+; CHECK:    [[CMP:%.*]] = icmp eq i32 [[NEWOFFSET]], 10
+; CHECK:    br i1 [[CMP]], label %[[END:.*]], label %[[LOOP]]
+; CHECK:  [[END]]:
+; CHECK:    ret void
+;
+; here shufflevectors break the vectorial nature of the arguments
+; scalarization should be done
+  br label %loop
+
+loop:
+  %offset = phi i32 [ 0, %0 ], [ %newoffset, %loop ]
+
+  %data = phi <8 x i32> [ zeroinitializer, %0 ], [ %newdata2, %loop ]
+  %data2 = shufflevector <8 x i32> %data, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %newdata = call <4 x i32> @do_math_v4i32_v4i32(<4 x i32> %data2)
+  %newdata2 = shufflevector <4 x i32> %newdata, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+
+  %newoffset = add i32 %offset, 1
+  %1 = icmp eq i32 %newoffset, 10
+  br i1 %1, label %end, label %loop
+
+end:
+  ret void
+}
+
+declare spir_func void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32, <8 x i32>) #1
+declare spir_func <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32) #1
+declare spir_func <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1) #1
+declare spir_func <4 x i32> @do_math_v4i32_v4i32(<4 x i32>) #1
+declare spir_func <8 x i32> @do_math_v8i32_v8i32(<8 x i32>) #1
+
+attributes #0 = { convergent nounwind }
+attributes #1 = { nounwind }
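
Note for reviewers (not part of the patch): the sketch below shows one way to exercise the new entry point programmatically, mirroring what the RUN line in selective.ll does through igc_opt. It is a minimal illustration only; runSelectiveScalarizer is a hypothetical helper, it assumes a build that links against IGC and LLVM, and the IGC_EnableSelectiveScalarizer regkey must still be enabled at runtime for the selective path to fire (see the constructor comment above).

    // Hypothetical driver snippet, for illustration only.
    #include "Compiler/Optimizer/Scalarizer.h"
    #include <llvm/IR/LegacyPassManager.h>
    #include <llvm/IR/Module.h>

    void runSelectiveScalarizer(llvm::Module& M)
    {
        llvm::legacy::PassManager PM;
        // true => selective scalarization; the DominatorTreeWrapperPass
        // dependency declared via getAnalysisUsage() is scheduled by the
        // legacy pass manager automatically.
        PM.add(createScalarizerPass(/*selectiveScalarization=*/true));
        PM.run(M);
    }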