diff --git a/IGC/Compiler/CISACodeGen/GenerateBlockMemOpsPass.cpp b/IGC/Compiler/CISACodeGen/GenerateBlockMemOpsPass.cpp
index 56dcd0c8c2d6..7e0fb482989f 100644
--- a/IGC/Compiler/CISACodeGen/GenerateBlockMemOpsPass.cpp
+++ b/IGC/Compiler/CISACodeGen/GenerateBlockMemOpsPass.cpp
@@ -12,7 +12,10 @@ SPDX-License-Identifier: MIT
 #include "Compiler/IGCPassSupport.h"
 #include "Compiler/CISACodeGen/helper.h"
 #include "common/LLVMWarningsPush.hpp"
+#include <cmath>
 #include <llvm/IR/Function.h>
+#include "llvm/IR/Verifier.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include "common/LLVMWarningsPop.hpp"
 #include "GenerateBlockMemOpsPass.hpp"
 #include "IGCIRBuilder.h"
@@ -32,7 +35,7 @@ IGC_INITIALIZE_PASS_DEPENDENCY(WIAnalysis)
 IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
 IGC_INITIALIZE_PASS_END(GenerateBlockMemOpsPass, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
 
-const uint MaxSgSize = 32;
+const size_t MaxSgSize = 32;
 
 GenerateBlockMemOpsPass::GenerateBlockMemOpsPass() : FunctionPass(ID) {
     initializeGenerateBlockMemOpsPassPass(*PassRegistry::getPassRegistry());
@@ -43,34 +46,472 @@ bool GenerateBlockMemOpsPass::runOnFunction(Function &F) {
         return false;
 
     bool Changed = false;
-    SmallVector<Instruction *, 8> LoadStoreToProcess;
+
+    // Load / store instructions that are not under divergent control flow and can be optimized.
+    SmallVector<Instruction *, 8> LoadStoreToProcess;
+    // Load / store instructions that are inside a loop and can be optimized.
+    DenseMap<Loop *, SmallVector<Instruction *, 4>> LoadStoreInLoop;
 
     MdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
     CGCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
+    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+    WI = &getAnalysis<WIAnalysis>();
 
     IGCMD::FunctionInfoMetaDataHandle Info = MdUtils->getFunctionsInfoItem(&F);
     if (Info->getType() != FunctionTypeMD::KernelFunction)
         return false;
 
+    // If the subgroup size is not specified, the maximum subgroup size is used.
+    IGC::IGCMD::FunctionInfoMetaDataHandle FuncInfoMD = MdUtils->getFunctionsInfoItem(&F);
+    IGC::IGCMD::SubGroupSizeMetaDataHandle SubGroupSize = FuncInfoMD->getSubGroupSize();
+    if (SubGroupSize->hasValue()) {
+        SimdSize = SubGroupSize->getSIMDSize();
+    } else {
+        SimdSize = MaxSgSize;
+    }
+
     // Check that workgroups have been scalarized along the x-axis.
     if (!checkVectorizationAlongX(&F))
        return false;
 
-    WI = &getAnalysis<WIAnalysis>();
-
     // Collect all load / store instructions which can be replaced.
-    for (auto &B : F)
-        for (auto &I : B)
-            if (canOptLoadStore(&I))
+    for (auto &B : F) {
+        for (auto &I : B) {
+            if (!isa<LoadInst>(&I) && !isa<StoreInst>(&I))
+                continue;
+
+            if (!canOptLoadStore(&I))
+                continue;
+
+            // Block read and write instructions must be executed by all elements in the subgroup.
+            if (!WI->insideDivergentCF(&I)) {
                 LoadStoreToProcess.push_back(&I);
+            } else if (Loop *L = LI->getLoopFor(I.getParent())) {
+                // In some cases IGC cannot prove that there is no code divergence in the loop.
+                // Handle these cases here.
+
+                // Check whether the loop has already been analyzed.
+                if (LoadStoreInLoop.find(L) == LoadStoreInLoop.end()) {
+                    if (!isLoopPattern(L))
+                        continue;
+
+                    SmallVector<Instruction *, 4> Vec;
+                    Vec.push_back(&I);
+                    LoadStoreInLoop.insert(std::make_pair(L, Vec));
+                } else {
+                    LoadStoreInLoop[L].push_back(&I);
+                }
+            }
+        }
+    }
 
-    // Replace load / store instructions with block ones.
+    // Optimize cases without loops.
     for (auto I : LoadStoreToProcess)
-        Changed = changeToBlockInst(I);
+        Changed |= changeToBlockInst(I);
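+
+    // Illustrative effect (a sketch; the intrinsic names are the ones checked in the lit test below):
+    // a per-lane contiguous access such as
+    //   %v = load float, float addrspace(1)* %p, align 4
+    // is rewritten by changeToBlockInst into one subgroup-wide block read:
+    //   %v = call float @llvm.genx.GenISA.simdBlockRead.f32.p1f32(float addrspace(1)* %p)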
+
+    // Optimize cases with loops. Split the loop into a remainder computation and a new uniform loop.
+    // The remainder contains the code divergence.
+    // The new loop contains the main part of the original loop, free of code divergence.
+    //
+    // For example:
+    //
+    // for (int idx = global_id_x + offset; idx < N; idx += simdsize) {
+    //     A[idx] = B[idx];
+    // }
+    //
+    // will be split into:
+    //
+    // int idx = global_id_x + offset;
+    // if (idx < N - (N - offset) / simdsize * simdsize) {
+    //     A[idx] = B[idx];
+    // }
+    //
+    // for (int idx = global_id_x + N - (N - offset) / simdsize * simdsize; idx < N; idx += simdsize) {
+    //     auto x = sg.load(&B[idx]);
+    //     sg.store(&A[idx], x);
+    // }
+    //
+    for (const auto &Pair : LoadStoreInLoop) {
+        Loop *L = Pair.first;
+        BasicBlock *OldLatch = L->getLoopLatch();
+        BasicBlock *OldPreheader = L->getLoopPreheader();
+        PHINode *OldInductionPHI = L->getInductionVariable(*SE);
+        ICmpInst *OldLatchCmp = cast<ICmpInst>(cast<BranchInst>(OldLatch->getTerminator())->getCondition());
+        Value *OldLimit = OldLatchCmp->getOperand(1);
+        Value *OldIncomingIndV = OldInductionPHI->getIncomingValueForBlock(OldPreheader);
+
+        SmallVector<BasicBlock *, 4> ExitBlocks;
+        L->getExitBlocks(ExitBlocks);
+        BasicBlock *Exit = ExitBlocks[0];
+
+        // Get the branch instruction that defines the condition for entering the loop.
+        BranchInst *PreConditionBranch = cast<BranchInst>(OldPreheader->getTerminator());
+        if (!PreConditionBranch->isConditional())
+            PreConditionBranch = cast<BranchInst>((*pred_begin(OldPreheader))->getTerminator());
+        ICmpInst *PreCondition = dyn_cast<ICmpInst>(PreConditionBranch->getCondition());
+
+        // Get the offset for the initial value of the induction variable.
+        SmallVector<Value *, 2> Offset;
+        if (!getOffset(OldIncomingIndV, Offset))
+            continue;
+
+        // Create a new basic block that separates the remainder from the new loop.
+        LLVMContext &Context = OldLatch->getContext();
+        BasicBlock *SeparatorBasicBlock = BasicBlock::Create(Context, ".separator", &F);
+        SeparatorBasicBlock->moveAfter(OldLatch);
+
+        // Clone the loop.
+        ValueToValueMapTy VMap;
+        BasicBlock *ClonedLatch = CloneBasicBlock(OldLatch, VMap, ".new.loop", &F);
+        for (auto &I : *ClonedLatch)
+            RemapInstruction(&I, VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+
+        // Clone the pre-condition and the pre-condition branch into the separator block.
+        ICmpInst *ClonedPreCondition = cast<ICmpInst>(PreCondition->clone());
+        BranchInst *ClonedPreConditionBranch = cast<BranchInst>(PreConditionBranch->clone());
+        SeparatorBasicBlock->getInstList().push_back(ClonedPreCondition);
+        SeparatorBasicBlock->getInstList().push_back(ClonedPreConditionBranch);
+
+        // Create an empty exit for the new loop.
+        BasicBlock *ExitForTheNewLoop = BasicBlock::Create(Context, ".new.exit", &F);
+        ExitForTheNewLoop->moveAfter(ClonedLatch);
+        IRBuilder<> Builder(ExitForTheNewLoop);
+        Builder.CreateBr(Exit);
+        Changed = true;
+
+        // Create an empty preheader for the new loop.
+        BasicBlock *PreheaderForTheNewLoop = BasicBlock::Create(Context, ".new.preheader", &F);
+        PreheaderForTheNewLoop->moveAfter(SeparatorBasicBlock);
+        Builder.SetInsertPoint(PreheaderForTheNewLoop);
+        Builder.CreateBr(ClonedLatch);
+
+        // Update the cloned pre-condition branch successors.
+        ClonedPreConditionBranch->setCondition(ClonedPreCondition);
+        ClonedPreConditionBranch->setSuccessor(0, PreheaderForTheNewLoop);
+        ClonedPreConditionBranch->setSuccessor(1, Exit);
+
+        // Update the cloned latch branch successors.
+        BranchInst *ClonedLatchBranch = cast<BranchInst>(ClonedLatch->getTerminator());
+        ClonedLatchBranch->setSuccessor(0, ClonedLatch);
+        ClonedLatchBranch->setSuccessor(1, ExitForTheNewLoop);
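+
+        // CFG being assembled at this point (sketch; names are the blocks created above):
+        //   old latch (remainder) -> .separator
+        //   .separator --cloned pre-condition true--> .new.preheader -> .new.loop
+        //   .separator --false--> exit;  .new.loop --back edge--> .new.loop, --done--> .new.exit -> exit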
+
+        // Insert the cloned latch block after the separator block.
+        ClonedLatch->moveAfter(SeparatorBasicBlock);
+
+        // Calculate the new limit for the remainder:
+        // newlimit = limit - (limit - offset1 - offset2) / simdsize * simdsize
+        //
+        // In IR it looks like:
+        //
+        // %suboffset1 = sub i32 %limit, %offset1
+        // %suboffset2 = sub i32 %suboffset1, %offset2
+        // %qot = ashr i32 %suboffset2, log2(SimdSize)
+        // %neg_qot = sub i32 0, %qot
+        // %qotshl = shl i32 %neg_qot, log2(SimdSize)
+        // %newlimit = add nsw i32 %limit, %qotshl
+        //
+        Type *LimitType = OldLimit->getType();
+
+        auto processOffset = [&](Value *SubArg) {
+            for (auto Val : Offset) {
+                if (!Val)
+                    break;
+
+                Value *OffsetVal = Val;
+                Type *ValType = Val->getType();
+                if (LimitType != ValType)
+                    OffsetVal = Builder.CreateZExt(Val, LimitType, "casted_offset");
+
+                SubArg = Builder.CreateSub(SubArg, OffsetVal);
+            }
+            return SubArg;
+        };
+
+        // Calculate the new limit (NewLimitFirstLoop) for the remainder.
+        Builder.SetInsertPoint(PreCondition);
+        Value *SubOffset = processOffset(OldLimit);
+
+        int LogSimdSizeBase2 = std::log2(SimdSize);
+        Value *AshrInst = Builder.CreateAShr(SubOffset, ConstantInt::get(LimitType, LogSimdSizeBase2), "ashr");
+        Value *Neg = Builder.CreateSub(ConstantInt::get(LimitType, 0), AshrInst, "neg");
+        Value *Shl = Builder.CreateShl(Neg, ConstantInt::get(LimitType, LogSimdSizeBase2));
+        Value *NewLimitFirstLoop = Builder.CreateAdd(OldLimit, Shl);
+
+        // Update the cmp instructions in the remainder and in the preheader with the new limit.
+        PreCondition->setOperand(1, NewLimitFirstLoop);
+        OldLatchCmp->setOperand(1, NewLimitFirstLoop);
+
+        // Calculate the initial value of the induction variable for the new loop.
+        Builder.SetInsertPoint(SeparatorBasicBlock, SeparatorBasicBlock->getFirstInsertionPt());
+        Value *OffsetForNewLoop = processOffset(NewLimitFirstLoop);
+
+        Value *NewIncInductiveVar = Builder.CreateAdd(OldIncomingIndV, OffsetForNewLoop);
+
+        // Set operands for the cloned pre-condition.
+        ClonedPreCondition->setOperand(0, NewIncInductiveVar);
+        ClonedPreCondition->setOperand(1, OldLimit);
+
+        // Substitute the load / store instructions in the new loop with block ones.
+        for (auto I : Pair.second) {
+            Instruction *NewI = cast<Instruction>(VMap[cast<Value>(I)]);
+            changeToBlockInst(NewI);
+        }
+
+        std::vector<PHINode *> PhiNodes;
+        // Set operands for the phi instructions in the new loop and prepare their initial values.
+        for (auto &I : *OldLatch) {
+            if (!isa<PHINode>(&I))
+                break;
+
+            Value *IVal = cast<Value>(&I);
+            PHINode *Phi = cast<PHINode>(&I);
+            PHINode *NewPhi = dyn_cast<PHINode>(VMap[IVal]);
+            Value *OldIncomingV = Phi->getIncomingValueForBlock(OldPreheader);
+            PhiNodes.push_back(Phi);
+
+            for (unsigned i = 0; i < Phi->getNumIncomingValues(); ++i) {
+                if (NewPhi->getIncomingBlock(i) == OldLatch) {
+                    NewPhi->setIncomingBlock(i, ClonedLatch);
+                } else if (NewPhi->getIncomingBlock(i) == OldPreheader) {
+                    NewPhi->setIncomingBlock(i, PreheaderForTheNewLoop);
+                }
+            }
+
+            Value *NewInc = nullptr;
+            if (GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(OldIncomingV)) {
+                Type *GEPType = Gep->getResultElementType();
+                NewInc = Builder.CreateGEP(GEPType, OldIncomingV, OffsetForNewLoop);
+            } else if (Phi == OldInductionPHI) {
+                NewInc = NewIncInductiveVar;
+            }
+            NewPhi->setIncomingValueForBlock(PreheaderForTheNewLoop, NewInc);
+        }
+
+        // Erase the phi instructions from the remainder (this turns it into a plain if-statement).
+        for (auto Phi : PhiNodes) {
+            Value *OldIncomingV = Phi->getIncomingValueForBlock(OldPreheader);
+            Phi->replaceAllUsesWith(OldIncomingV);
+            Phi->eraseFromParent();
+        }
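+
+        // Worked example with hypothetical values simdsize = 32, offset = 0 and limit N = 100:
+        // newlimit = 100 - (100 / 32) * 32 = 4, so lanes with global_id_x < 4 execute the
+        // remainder once, and the uniform loop then covers [global_id_x + 4, 100) in three
+        // full subgroup-wide steps.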
+
+        // Erase the conditional branch from the old latch and create an unconditional branch
+        // to the separator block.
+        BranchInst *OldLatchBranch = cast<BranchInst>(OldLatch->getTerminator());
+        Builder.SetInsertPoint(OldLatchBranch);
+        Builder.CreateBr(SeparatorBasicBlock);
+        OldLatchBranch->eraseFromParent();
+        PreConditionBranch->setSuccessor(1, SeparatorBasicBlock);
+    }
 
     return Changed;
 }
 
+using namespace llvm::PatternMatch;
+bool GenerateBlockMemOpsPass::getOffset(Value *Init, SmallVector<Value *, 2> &Offset) {
+    Value *NonUnifOp = Init;
+    while (NonUnifOp) {
+        if (ZExtInst *ZExt = dyn_cast<ZExtInst>(NonUnifOp)) {
+            NonUnifOp = ZExt->getOperand(0);
+        } else if (SExtInst *SExt = dyn_cast<SExtInst>(NonUnifOp)) {
+            NonUnifOp = SExt->getOperand(0);
+        } else if (Instruction *Inst = dyn_cast<Instruction>(NonUnifOp)) {
+            if (Inst->getOpcode() != Instruction::Add)
+                return false;
+
+            IGC::IGCMD::FunctionInfoMetaDataHandle FuncInfoMD = MdUtils->getFunctionsInfoItem(Inst->getFunction());
+            IGC::IGCMD::ThreadGroupSizeMetaDataHandle ThreadGroupSize = FuncInfoMD->getThreadGroupSize();
+
+            // ThreadGroupSize must be specified; this is checked earlier in checkVectorizationAlongX.
+            IGC_ASSERT(ThreadGroupSize->hasValue());
+            int LogBase2 = std::log2((int32_t)ThreadGroupSize->getXDim());
+
+            // Check for the global_id_x pattern.
+            Value *LocalIdX = nullptr;
+            Value *R0 = nullptr;
+            auto GlobalIdXPattern = m_Add(m_Shl(m_ExtractElt(m_Value(R0), m_SpecificInt(1)), m_SpecificInt(LogBase2)), m_Value(LocalIdX));
+            if (match(NonUnifOp, GlobalIdXPattern)) {
+                if (ZExtInst *ZExt = dyn_cast<ZExtInst>(LocalIdX))
+                    LocalIdX = ZExt->getOperand(0);
+
+                if (isLocalIdX(LocalIdX) && isR0(R0))
+                    return true;
+            }
+
+            Value *Op0 = Inst->getOperand(0);
+            Value *Op1 = Inst->getOperand(1);
+            if (!WI->isUniform(Op1) && !WI->isUniform(Op0))
+                return false;
+
+            if (Offset.size() == 2)
+                return false;
+
+            if (WI->isUniform(Op0)) {
+                Offset.push_back(Op0);
+                NonUnifOp = Op1;
+            } else {
+                Offset.push_back(Op1);
+                NonUnifOp = Op0;
+            }
+        } else {
+            return false;
+        }
+    }
+
+    return false;
+}
+
+bool GenerateBlockMemOpsPass::isLoopPattern(Loop *L) {
+    if (!L)
+        return false;
+
+    BasicBlock *Header = L->getHeader();
+    BasicBlock *Latch = L->getLoopLatch();
+    BasicBlock *Preheader = L->getLoopPreheader();
+    PHINode *Phi = L->getInductionVariable(*SE);
+
+    // Check that the loop has a good shape, so it is safe to use the LLVM helpers on it.
+    if (!L->isSafeToClone() || (L->getNumBlocks() != 1) || !L->isLCSSAForm(*DT))
+        return false;
+
+    // Check that all parts of the loop can be found.
+    if (!Phi || !Preheader || !Latch || !Header)
+        return false;
+
+    ICmpInst *LatchCmp = dyn_cast<ICmpInst>(cast<BranchInst>(Latch->getTerminator())->getCondition());
+    if (!LatchCmp)
+        return false;
+
+    if (pred_size(Header) != 2)
+        return false;
+
+    // Check that the loop has only one exit block.
+    SmallVector<BasicBlock *, 4> ExitBlocks;
+    L->getExitBlocks(ExitBlocks);
+    if (ExitBlocks.size() != 1)
+        return false;
+
+    BasicBlock *Exit = ExitBlocks[0];
+
+    // Check that all values defined inside the loop have only internal users.
+    if (doesLoopHaveExternUse(L))
+        return false;
+
+    // Check that the loop's phi instructions have the expected form.
+    if (!checkLoopPhiVals(L))
+        return false;
+
+    // Check that the induction variable is incremented by the simd size.
+    Instruction *Inc = dyn_cast<Instruction>(Phi->getIncomingValueForBlock(Latch));
+    if (!Inc || Inc->getOpcode() != Instruction::Add || (Inc->getOperand(0) != Phi && Inc->getOperand(1) != Phi))
+        return false;
+
+    ConstantInt *CI = dyn_cast<ConstantInt>(Inc->getOperand(0));
+    if (!CI)
+        CI = dyn_cast<ConstantInt>(Inc->getOperand(1));
+    if (!CI)
+        return false;
+    if (CI->getValue() != SimdSize)
+        return false;
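+
+    // Increment shape this matches, as in the lit test below (SimdSize == 32):
+    //   %incr = add nsw i64 %ind, 32
+    //   %cond = icmp slt i64 %incr, %limit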
+
+    // Check that the loop condition is ULT or SLT.
+    CmpInst::Predicate Pred = LatchCmp->getPredicate();
+    if (Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_SLT)
+        return false;
+
+    // The loop limit must be uniform.
+    Value *Limit = LatchCmp->getOperand(1);
+    if (!WI->isUniform(Limit))
+        return false;
+
+    // The initial value of the induction variable must be continuous across the subgroup.
+    Value *InitValForIndVar = Phi->getIncomingValueForBlock(Preheader);
+    if (!isIndexContinuous(InitValForIndVar))
+        return false;
+
+    // Find the conditional branch that decides whether the loop is executed at all.
+    // It can be placed in the preheader or in the preheader's single predecessor.
+    // Its condition must match the condition in the loop latch.
+    BranchInst *PreConditionBranch = cast<BranchInst>(Preheader->getTerminator());
+    if (!PreConditionBranch->isConditional()) {
+        if (Preheader->size() != 1)
+            return false;
+
+        PreConditionBranch = nullptr;
+
+        if (Preheader->hasNPredecessors(1))
+            PreConditionBranch = cast<BranchInst>((*pred_begin(Preheader))->getTerminator());
+    }
+
+    if (!PreConditionBranch || !PreConditionBranch->isConditional())
+        return false;
+
+    ICmpInst *PreCondition = dyn_cast<ICmpInst>(PreConditionBranch->getCondition());
+    if (!PreCondition || PreCondition->getPredicate() != Pred || PreCondition->getOperand(1) != Limit)
+        return false;
+
+    if ((PreConditionBranch->getSuccessor(0) != Latch) && (PreConditionBranch->getSuccessor(0) != Preheader))
+        return false;
+
+    // The false edge of PreConditionBranch must lead to the loop exit, or the exit block must be
+    // empty and fall through to the same target.
+    if (PreConditionBranch->getSuccessor(1) != Exit) {
+        if (Exit->size() != 1)
+            return false;
+
+        BranchInst *ExitBranch = cast<BranchInst>(Exit->getTerminator());
+        if (ExitBranch->isConditional())
+            return false;
+
+        if (ExitBranch->getSuccessor(0) != PreConditionBranch->getSuccessor(1))
+            return false;
+    }
+
+    return true;
+}
+
+// Check that the incoming values of all phi instructions except the induction variable
+// are getelementptr instructions.
+bool GenerateBlockMemOpsPass::checkLoopPhiVals(Loop *L) {
+    BasicBlock *Preheader = L->getLoopPreheader();
+    BasicBlock *Latch = L->getLoopLatch();
+    PHINode *IndPhi = L->getInductionVariable(*SE);
+
+    for (auto &I : *Latch) {
+        PHINode *Phi = dyn_cast<PHINode>(&I);
+        if (!Phi)
+            break;
+
+        Value *IncomingVal = Phi->getIncomingValueForBlock(Preheader);
+        Value *InternalVal = Phi->getIncomingValueForBlock(Latch);
+
+        if (Phi != IndPhi) {
+            if (!isa<GetElementPtrInst>(IncomingVal))
+                return false;
+
+            if (!isa<GetElementPtrInst>(InternalVal))
+                return false;
+        }
+    }
+
+    return true;
+}
+
+// Check that values defined in the loop have no users outside of it.
+bool GenerateBlockMemOpsPass::doesLoopHaveExternUse(Loop *L) {
+    // The loop is expected to consist of a single block; this is checked earlier in isLoopPattern.
+    IGC_ASSERT(L->getNumBlocks() == 1);
+
+    BasicBlock *Latch = L->getLoopLatch();
+    for (auto &I : *Latch) {
+        for (User *U : I.users()) {
+            Instruction *Inst = dyn_cast<Instruction>(U);
+            if (!Inst)
+                return true;
+
+            if (Inst->getParent() != Latch)
+                return true;
+        }
+    }
+
+    return false;
+}
+
 bool GenerateBlockMemOpsPass::isAddressAligned(Value *Ptr, const alignment_t &CurrentAlignment, Type *DataType) {
     unsigned ScalarSize = DataType->getScalarSizeInBits();
 
@@ -84,38 +525,70 @@ bool GenerateBlockMemOpsPass::isAddressAligned(Value *Ptr, const alignment_t &CurrentAlignment, Type *DataType)
 // This function checks if Indx is equal to 1 * LocalIdX + UniformPart, assuming LocalIdY and LocalIdZ are uniform values.
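+// For instance, Indx = zext(local_id_x) + uniform_base is continuous (lane i accesses element
+// uniform_base + i), while Indx = 2 * local_id_x is not: adjacent lanes would touch
+// non-adjacent elements, which block reads / writes cannot handle.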
 bool GenerateBlockMemOpsPass::isIndexContinuous(Value *Indx) {
-    Instruction *NonUnifInst = dyn_cast<Instruction>(Indx);
-    if (!NonUnifInst)
-        return false;
+    // Worklist of non-uniform values that still have to be proven continuous.
+    SmallVector<Value *, 8> NonUniformInstVector;
+    NonUniformInstVector.push_back(Indx);
+    PHINode *VisitedPhi = nullptr;
 
-    Value *NonUniformOp = nullptr;
-    // Continuity requires that only add and zext operations can be performed on a non-uniform value.
-    while (NonUnifInst) {
-        if (isa<ZExtInst>(NonUnifInst)) {
-            NonUniformOp = NonUnifInst->getOperand(0);
-        } else if (NonUnifInst->getOpcode() == Instruction::Add) {
-            Value *Op0 = NonUnifInst->getOperand(0);
-            Value *Op1 = NonUnifInst->getOperand(1);
-            if (!WI->isUniform(Op1) && !WI->isUniform(Op0))
-                return false;
-            if (WI->isUniform(Op0)) {
-                NonUniformOp = Op1;
-            } else {
-                NonUniformOp = Op0;
-            }
-        } else {
-            return false;
-        }
-
-        // If local_id_x was met then index is continuous.
-        if (isLocalIdX(NonUniformOp))
-            return true;
-
-        NonUnifInst = dyn_cast<Instruction>(NonUniformOp);
+    // Continuity requires that only add, zext / sext and a single loop-carried phi
+    // are applied to the non-uniform part of the index.
+    while (!NonUniformInstVector.empty()) {
+        Value *NonUnifOp = NonUniformInstVector.pop_back_val();
+
+        if (!NonUnifOp)
+            return false;
+
+        if (ZExtInst *ZExt = dyn_cast<ZExtInst>(NonUnifOp)) {
+            NonUniformInstVector.push_back(ZExt->getOperand(0));
+        } else if (SExtInst *SExt = dyn_cast<SExtInst>(NonUnifOp)) {
+            NonUniformInstVector.push_back(SExt->getOperand(0));
+        } else if (PHINode *Phi = dyn_cast<PHINode>(NonUnifOp)) {
+            // Check that the PHINode has two incoming values, one of which is calculated
+            // from local_id_x and the other from this PHINode itself.
+            if (VisitedPhi && VisitedPhi != Phi)
+                return false;
+
+            if (VisitedPhi)
+                continue;
+
+            if (Phi->getNumIncomingValues() != 2)
+                return false;
+
+            for (Use &U : Phi->incoming_values()) {
+                Value *V = U.get();
+                if (WI->isUniform(V))
+                    return false;
+
+                NonUniformInstVector.push_back(V);
+            }
+            VisitedPhi = Phi;
+        } else if (Instruction *Inst = dyn_cast<Instruction>(NonUnifOp)) {
+            if (Inst->getOpcode() != Instruction::Add)
+                return false;
+
+            Value *Op0 = Inst->getOperand(0);
+            Value *Op1 = Inst->getOperand(1);
+
+            if (!WI->isUniform(Op1) && !WI->isUniform(Op0))
+                return false;
+
+            // Keep walking through the non-uniform operand.
+            if (WI->isUniform(Op0)) {
+                NonUniformInstVector.push_back(Op1);
+            } else {
+                NonUniformInstVector.push_back(Op0);
+            }
+        } else if (!isLocalIdX(NonUnifOp)) {
+            // Only local_id_x is an acceptable non-uniform leaf; anything else breaks continuity.
+            return false;
+        }
     }
 
-    return false;
+    return true;
 }
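+
+// E.g. the loop-carried index from the lit test below is accepted:
+//   %ind = phi i64 [ %sum64, %preheader ], [ %incr, %latch ]
+// where %sum64 derives from local_id_x and %incr = add nsw i64 %ind, 32 steps it
+// by the (uniform) subgroup size.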
 
@@ -123,40 +596,33 @@
 bool GenerateBlockMemOpsPass::checkVectorizationAlongX(Function *F) {
     if (CGCtx->type != ShaderType::OPENCL_SHADER)
         return false;
 
-    IGC::IGCMD::FunctionInfoMetaDataHandle funcInfoMD = MdUtils->getFunctionsInfoItem(F);
-    ModuleMetaData *modMD = CGCtx->getModuleMetaData();
-    auto funcMD = modMD->FuncMD.find(F);
+    IGC::IGCMD::FunctionInfoMetaDataHandle FuncInfoMD = MdUtils->getFunctionsInfoItem(F);
+    ModuleMetaData *ModMD = CGCtx->getModuleMetaData();
+    auto FuncMD = ModMD->FuncMD.find(F);
 
-    if (funcMD == modMD->FuncMD.end())
+    if (FuncMD == ModMD->FuncMD.end())
         return false;
 
-    WorkGroupWalkOrderMD workGroupWalkOrder = funcMD->second.workGroupWalkOrder;
-    if (workGroupWalkOrder.dim0 != 0 || workGroupWalkOrder.dim1 != 1 || workGroupWalkOrder.dim2 != 2)
+    WorkGroupWalkOrderMD WorkGroupWalkOrder = FuncMD->second.workGroupWalkOrder;
+    if (WorkGroupWalkOrder.dim0 != 0 || WorkGroupWalkOrder.dim1 != 1 || WorkGroupWalkOrder.dim2 != 2)
         return false;
 
     int32_t X = -1;
-    IGC::IGCMD::ThreadGroupSizeMetaDataHandle threadGroupSize = funcInfoMD->getThreadGroupSize();
-    if (!threadGroupSize->hasValue())
+    IGC::IGCMD::ThreadGroupSizeMetaDataHandle ThreadGroupSize = FuncInfoMD->getThreadGroupSize();
+    if (!ThreadGroupSize->hasValue())
         return false;
 
-    X = (int32_t)threadGroupSize->getXDim();
+    X = (int32_t)ThreadGroupSize->getXDim();
     if (!X)
         return false;
 
-    if (X % MaxSgSize == 0)
+    if (X % SimdSize == 0)
         return true;
 
     return false;
 }
 
 bool GenerateBlockMemOpsPass::canOptLoadStore(Instruction *I) {
-    if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
-        return false;
-
-    // Block read and write instructions must be called by all elements in the subgroup.
-    if (WI->insideDivergentCF(I))
-        return false;
-
     Value *Ptr = nullptr;
     Value *ValOp = nullptr;
     Type *DataType = nullptr;
@@ -181,9 +647,9 @@ bool GenerateBlockMemOpsPass::canOptLoadStore(Instruction *I) {
     if (!isAddressAligned(Ptr, CurrentAlignment, DataType))
         return false;
 
-    GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
     // Get the last index from the getelementptr instruction if it is not uniform in the subgroup.
-    Value *Idx = checkGep(Gep);
+    Instruction *PtrInstr = dyn_cast<Instruction>(Ptr);
+    Value *Idx = checkGep(PtrInstr);
 
     if (!Idx)
         return false;
 
@@ -202,9 +668,21 @@ bool GenerateBlockMemOpsPass::isLocalIdX(const Value *InputVal) {
     Function *F = const_cast<Function *>(A->getParent());
     ImplicitArgs implicitArgs(*F, MdUtils);
     Value *localIdX = implicitArgs.getImplicitArgValue(*F, ImplicitArg::LOCAL_ID_X, MdUtils);
+
     return A == localIdX;
 }
 
+bool GenerateBlockMemOpsPass::isR0(const Value *InputVal) {
+    const Argument *A = dyn_cast<Argument>(InputVal);
+    if (!A)
+        return false;
+    Function *F = const_cast<Function *>(A->getParent());
+    ImplicitArgs implicitArgs(*F, MdUtils);
+    Value *R0 = implicitArgs.getImplicitArgValue(*F, ImplicitArg::R0, MdUtils);
+
+    return A == R0;
+}
+
 bool GenerateBlockMemOpsPass::changeToBlockInst(Instruction *I) {
     IRBuilder<> Builder(I);
     Function *BlockOpDecl = nullptr;
@@ -235,7 +713,31 @@ bool GenerateBlockMemOpsPass::changeToBlockInst(Instruction *I) {
     return true;
 }
 
-Value *GenerateBlockMemOpsPass::checkGep(GetElementPtrInst *Gep) {
+Value *GenerateBlockMemOpsPass::checkGep(Instruction *PtrInstr) {
+    if (!PtrInstr)
+        return nullptr;
+
+    PHINode *Phi = dyn_cast<PHINode>(PtrInstr);
+    GetElementPtrInst *Gep = nullptr;
+    if (Phi) {
+        if (Phi->getNumIncomingValues() != 2)
+            return nullptr;
+
+        BasicBlock *BB = PtrInstr->getParent();
+        Loop *L = LI->getLoopFor(BB);
+        if (!L)
+            return nullptr;
+
+        BasicBlock *Preheader = L->getLoopPreheader();
+        Value *IncomingVal = Phi->getIncomingValueForBlock(Preheader);
+
+        Gep = dyn_cast<GetElementPtrInst>(IncomingVal);
+        if (!Gep)
+            return nullptr;
+    } else {
+        Gep = dyn_cast<GetElementPtrInst>(PtrInstr);
+    }
+
     if (!Gep)
         return nullptr;
 
@@ -259,10 +761,41 @@ Value *GenerateBlockMemOpsPass::checkGep(GetElementPtrInst *Gep) {
     if (!IsLastIndUniform && IsPtrUniform) {
         return *LIndx;
     } else if (IsLastIndUniform && !IsPtrUniform) {
-        if (!isa<GetElementPtrInst>(Ptr))
+        if (!isa<GetElementPtrInst>(Ptr) && !isa<PHINode>(Ptr))
             return nullptr;
 
-        return checkGep(cast<GetElementPtrInst>(Ptr));
+        if (PHINode *Phi = dyn_cast<PHINode>(Ptr)) {
+            if (Phi->getNumIncomingValues() != 2)
+                return nullptr;
+
+            // One incoming value must be a gep that increments the phi itself by a uniform step;
+            // the other one provides the incoming address.
+            bool IsGepHasPhiArg = false;
+            for (Use &U : Phi->incoming_values()) {
+                Value *V = U.get();
+
+                if (!isa<GetElementPtrInst>(V))
+                    return nullptr;
+
+                GetElementPtrInst *G = cast<GetElementPtrInst>(V);
+
+                if (G->getOperand(0) == Phi) {
+                    // The address is incremented with a gep; all of its indices must be uniform.
+                    IsGepHasPhiArg = true;
+                    for (auto Idx = G->idx_begin(), E = G->idx_end(); Idx != E; Idx++) {
+                        if (!WI->isUniform(*Idx)) {
+                            return nullptr;
+                        }
+                    }
+                } else {
+                    // Get the incoming address value.
+                    Ptr = V;
+                }
+            }
+
+            if (!IsGepHasPhiArg)
+                return nullptr;
+        }
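+
+        // Illustrative recurrence this accepts (hypothetical IR; the step is one subgroup):
+        //   %p = phi float addrspace(1)* [ %p.init, %preheader ], [ %p.next, %latch ]
+        //   %p.next = getelementptr float, float addrspace(1)* %p, i64 32
+        // The check then recurses into %p.init.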
+        return checkGep(dyn_cast<Instruction>(Ptr));
     }
 
     return nullptr;
diff --git a/IGC/Compiler/CISACodeGen/GenerateBlockMemOpsPass.hpp b/IGC/Compiler/CISACodeGen/GenerateBlockMemOpsPass.hpp
index 4748647da5c3..7decfd9bbbc3 100644
--- a/IGC/Compiler/CISACodeGen/GenerateBlockMemOpsPass.hpp
+++ b/IGC/Compiler/CISACodeGen/GenerateBlockMemOpsPass.hpp
@@ -10,6 +10,8 @@ SPDX-License-Identifier: MIT
 
 #include "common/LLVMWarningsPush.hpp"
 #include <llvm/Pass.h>
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
 #include "common/LLVMWarningsPop.hpp"
 #include "Compiler/CISACodeGen/WIAnalysis.hpp"
 #include "GenISAIntrinsics/GenIntrinsicInst.h"
@@ -29,24 +31,35 @@ class GenerateBlockMemOpsPass : public llvm::FunctionPass
     }
 
     virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
-        AU.setPreservesCFG();
         AU.addRequired<WIAnalysis>();
         AU.addRequired<MetaDataUtilsWrapper>();
+        AU.addRequired<llvm::LoopInfoWrapperPass>();
+        AU.addRequired<llvm::DominatorTreeWrapperPass>();
+        AU.addRequired<llvm::ScalarEvolutionWrapperPass>();
         AU.addRequired<CodeGenContextWrapper>();
     }
 
     virtual bool runOnFunction(llvm::Function &F) override;
 
 private:
-    llvm::Value *checkGep(llvm::GetElementPtrInst *Gep);
+    llvm::Value *checkGep(llvm::Instruction *PtrInstr);
     bool isLocalIdX(const llvm::Value *InputVal);
+    bool isR0(const llvm::Value *InputVal);
     bool isAddressAligned(llvm::Value *Ptr, const alignment_t &CurrentAlignment, llvm::Type *DataType);
     bool isIndexContinuous(llvm::Value *Addr);
     bool checkVectorizationAlongX(llvm::Function *F);
+    bool checkLoopPhiVals(llvm::Loop *L);
     bool changeToBlockInst(llvm::Instruction *I);
+    bool doesLoopHaveExternUse(llvm::Loop *L);
+    bool getOffset(llvm::Value *Init, llvm::SmallVector<llvm::Value *, 2> &Offset);
     bool canOptLoadStore(llvm::Instruction *I);
+    bool isLoopPattern(llvm::Loop *L);
 
     WIAnalysis *WI = nullptr;
     IGC::CodeGenContext *CGCtx = nullptr;
     IGC::IGCMD::MetaDataUtils *MdUtils = nullptr;
+    llvm::DominatorTree *DT = nullptr;
+    llvm::LoopInfo *LI = nullptr;
+    llvm::ScalarEvolution *SE = nullptr;
+    size_t SimdSize = 0;
 };
 }
\ No newline at end of file
diff --git a/IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp b/IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp
index 7b17b7e7b69c..f1a8a172b4c9 100644
--- a/IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp
+++ b/IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp
@@ -1408,12 +1408,6 @@ void OptimizeIR(CodeGenContext* const pContext)
             mpm.add(new IGCConstProp());
             GFX_ONLY_PASS { mpm.add(createTranslateToProgrammableOffsetsPass()); }
 
-            // This pass needs to be extended for other devices
-            if (pContext->platform.getPlatformInfo().eProductFamily == IGFX_PVC)
-            {
-                mpm.add(new GenerateBlockMemOpsPass());
-            }
-            mpm.add(new BlockMemOpAddrScalarizationPass());
             mpm.add(new CustomSafeOptPass());
             if (!pContext->m_DriverInfo.WADisableCustomPass())
@@ -1477,7 +1471,20 @@ void OptimizeIR(CodeGenContext* const pContext)
                 mpm.add(llvm::createLoopRotatePass(LOOP_ROTATION_HEADER_INST_THRESHOLD));
                 mpm.add(llvm::createLCSSAPass());
                 mpm.add(llvm::createLoopSimplifyPass());
+            }
+        }
 
+        // This pass needs to be extended for other devices.
+        if (pContext->platform.getPlatformInfo().eProductFamily == IGFX_PVC)
+        {
+            mpm.add(new GenerateBlockMemOpsPass());
+        }
+        mpm.add(new BlockMemOpAddrScalarizationPass());
+
+        if (pContext->m_instrTypes.hasMultipleBB && !disableGOPT)
+        {
+            if (pContext->m_instrTypes.numOfLoop)
+            {
                 bool allowLICM = IGC_IS_FLAG_ENABLED(allowLICM) && pContext->m_retryManager.AllowLICM();
                 bool runGEPLSR = IGC_IS_FLAG_ENABLED(EnableGEPLSR) &&
                     pContext->type == ShaderType::OPENCL_SHADER &&
diff --git a/IGC/Compiler/tests/GenerateBlockMemOpsPass/block_read_write_check.ll b/IGC/Compiler/tests/GenerateBlockMemOpsPass/block_read_write_check.ll
index 6795d9a9ca30..9a275a06a5a6 100644
--- a/IGC/Compiler/tests/GenerateBlockMemOpsPass/block_read_write_check.ll
+++ b/IGC/Compiler/tests/GenerateBlockMemOpsPass/block_read_write_check.ll
@@ -60,7 +60,7 @@ entry:
   %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %out, i64 %conv.i
   store float %2, float addrspace(1)* %arrayidx1, align 4
 
-  ; CHECK-NOT: %{{.*}} = simdBlockWrite
+  ; CHECK-NOT: simdBlockWrite
 
   ret void
@@ -68,14 +68,47 @@ entry:
 }
 
-!igc.functions = !{!1, !2}
+define spir_kernel void @testYZUnifLoop(float addrspace(1)* %out, float addrspace(1)* %in, <8 x i32> %r0, <8 x i32> %payloadHeader, <3 x i32> %localSize, i16 %localIdX, i16 %localIdY, i16 %localIdZ, i32 %bufferOffset, i64 %limit) {
+; CHECK: %{{.*}} = load
+; CHECK: store
+; CHECK: [[TMP0:%.*]] = call float @llvm.genx.GenISA.simdBlockRead.f32.p1f32(float addrspace(1)* %{{.*}})
+; CHECK: call void @llvm.genx.GenISA.simdBlockWrite.p1f32.f32(float addrspace(1)* %{{.*}}, float [[TMP0]])
+entry:
+  %offset = extractelement <8 x i32> %payloadHeader, i64 0
+  %groupNumX = extractelement <8 x i32> %r0, i64 1
+  %shl = shl i32 %groupNumX, 5
+  %localIdX31 = zext i16 %localIdX to i32
+  %globalIdX = add i32 %shl, %localIdX31
+  %sum = add i32 %globalIdX, %offset
+  %sum64 = zext i32 %sum to i64
+  %precond = icmp slt i64 %sum64, %limit
+  br i1 %precond, label %preheader, label %terminator
+
+preheader:
+  br label %latch
+
+latch:
+  %ind = phi i64 [ %sum64, %preheader ], [ %incr, %latch ]
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %ind
+  %load = load float, float addrspace(1)* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %out, i64 %ind
+  store float %load, float addrspace(1)* %arrayidx1, align 4
+  %incr = add nsw i64 %ind, 32
+  %cond = icmp slt i64 %incr, %limit
+  br i1 %cond, label %latch, label %exit
+
+exit:
+  br label %terminator
+
+terminator:
+  ret void
+}
+
+!igc.functions = !{!1, !2, !3}
 !IGCMetadata = !{!19}
 
 !1 = !{void (float addrspace(1)*, float addrspace(1)*, <8 x i32>, <8 x i32>, <3 x i32>, i16, i16, i16, i32, i32)* @testYZUnif, !41}
 !2 = !{void (float addrspace(1)*, float addrspace(1)*, <8 x i32>, <8 x i32>, <3 x i32>, i16, i16, i16, i32, i32)* @testNoUnif, !42}
+!3 = !{void (float addrspace(1)*, float addrspace(1)*, <8 x i32>, <8 x i32>, <3 x i32>, i16, i16, i16, i32, i64)* @testYZUnifLoop, !43}
 !41 = !{!5, !6, !17}
 !42 = !{!5, !6}
-!43 = !{!5, !6, !18}
+!43 = !{!5, !6, !17}
 !5 = !{!"function_type", i32 0}
 !6 = !{!"implicit_arg_desc", !7, !8, !9, !10, !11, !12, !13, !15}
 !7 = !{i32 0}
@@ -99,11 +132,13 @@ entry:
 !18 = !{!"thread_group_size", i32 16, i32 32, i32 32}
 !19 = !{!"ModuleMD", !112}
 
-!112 = !{!"FuncMD", !113, !114, !333, !334}
+!112 = !{!"FuncMD", !113, !114, !333, !334, !335, !336}
 !113 = !{!"FuncMDMap[0]", void (float addrspace(1)*, float addrspace(1)*, <8 x i32>, <8 x i32>, <3 x i32>, i16, i16, i16, i32, i32)* @testYZUnif}
 !114 = !{!"FuncMDValue[0]", !116}
 !333 = !{!"FuncMDMap[1]", void (float addrspace(1)*, float addrspace(1)*, <8 x i32>, <8 x i32>, <3 x i32>, i16, i16, i16, i32, i32)* @testNoUnif}
 !334 = !{!"FuncMDValue[1]", !116}
+!335 = !{!"FuncMDMap[2]", void (float addrspace(1)*, float addrspace(1)*, <8 x i32>, <8 x i32>, <3 x i32>, i16, i16, i16, i32, i64)* @testYZUnifLoop}
+!336 = !{!"FuncMDValue[2]", !116}
 !116 = !{!"workGroupWalkOrder", !117, !118, !119}
 !117 = !{!"dim0", i32 0}
 !118 = !{!"dim1", i32 1}