diff --git a/IGC/Compiler/CISACodeGen/ConstantCoalescing.cpp b/IGC/Compiler/CISACodeGen/ConstantCoalescing.cpp index 7eba3a3f7d68..228b431ffb29 100644 --- a/IGC/Compiler/CISACodeGen/ConstantCoalescing.cpp +++ b/IGC/Compiler/CISACodeGen/ConstantCoalescing.cpp @@ -400,22 +400,9 @@ void ConstantCoalescing::ProcessBlock( std::vector & indcb_owloads, std::vector & indcb_gathers) { - // checking for stores for future load merging check - bool bbHasStores = false; - for (BasicBlock::iterator BBI_st = blk->begin(), BBE_st = blk->end(); - BBI_st != BBE_st; ++BBI_st) - { - if (isa(BBI_st) || - isa(BBI_st)) - { - bbHasStores = true; - break; - } - } - bool isCPS = false; - bool skipLdrawOpt = isCPS || bbHasStores; + bool skipLdrawOpt = isCPS; // get work-item analysis, need to update uniformness information for (BasicBlock::iterator BBI = blk->begin(), BBE = blk->end(); BBI != BBE; ++BBI) @@ -843,7 +830,8 @@ void ConstantCoalescing::MergeScatterLoad(Instruction* load, BufChunk* cur_chunk = *rit; if (CompareBufferBase(cur_chunk->bufIdxV, cur_chunk->addrSpace, bufIdxV, addrSpace) && cur_chunk->baseIdxV == eltIdxV && - cur_chunk->chunkIO->getType()->getScalarType() == load->getType()->getScalarType()) + cur_chunk->chunkIO->getType()->getScalarType() == load->getType()->getScalarType() && + !CheckForAliasingWrites(addrSpace, cur_chunk->chunkIO, load)) { uint lb = std::min(eltid, cur_chunk->chunkStart); uint ub = std::max(eltid + maxEltPlus, cur_chunk->chunkStart + cur_chunk->chunkSize); @@ -1242,7 +1230,8 @@ void ConstantCoalescing::MergeUniformLoad(Instruction* load, { if (CompareBufferBase(cur_chunk->bufIdxV, cur_chunk->addrSpace, bufIdxV, addrSpace) && cur_chunk->baseIdxV == eltIdxV && - cur_chunk->chunkIO->getType()->getScalarType() == LoadEltTy) + cur_chunk->chunkIO->getType()->getScalarType() == LoadEltTy && + !CheckForAliasingWrites(addrSpace, cur_chunk->chunkIO, load)) { uint lb = std::min(eltid, cur_chunk->chunkStart); uint ub = std::max(eltid + maxEltPlus, 
cur_chunk->chunkStart + cur_chunk->chunkSize); @@ -2619,4 +2608,127 @@ void ConstantCoalescing::ReplaceLoadWithSamplerLoad( loadToReplace->replaceAllUsesWith(result); } +// Check if any of the paths from the `dominatingChunk` to `mergeCandidate` has +// aliasing memory store or atomic operations. +bool ConstantCoalescing::CheckForAliasingWrites( + uint32_t addrSpace, + Instruction* dominatingChunk, + Instruction* mergeCandidate) +{ + BufferType accessType = DecodeBufferType(addrSpace); + // Only writable resources need to be checked. + if (accessType == STATELESS_READONLY || + accessType == CONSTANT_BUFFER || + accessType == BINDLESS_CONSTANT_BUFFER || + accessType == SSH_BINDLESS_CONSTANT_BUFFER) + { + return false; + } + + const DominatorTree& DT = getAnalysis().getDomTree(); + const bool statefulResourcesNotAliased = + m_ctx->getModuleMetaData()->statefulResourcesNotAliased; + BasicBlock* chunkBB = dominatingChunk->getParent(); + BasicBlock* candidateBB = mergeCandidate->getParent(); + + // Lambda checks if an instruction may write to memory that potentially + // aliases the loaded memory. + auto IsAliasedWrite = [accessType, statefulResourcesNotAliased]( + Instruction* inst) + { + // Treat memory fences as aliasing writes as they may change results + // of subsequent loads. 
+ auto* intr = dyn_cast(inst); + if (intr && + (intr->getIntrinsicID() == GenISAIntrinsic::GenISA_memoryfence || + intr->getIntrinsicID() == GenISAIntrinsic::GenISA_LSCFence)) + { + return true; + } + if (!inst->mayWriteToMemory()) + { + return false; + } + Value* ptr = GetBufferOperand(inst); + if (ptr == nullptr || !ptr->getType()->isPointerTy()) + { + return false; + } + BufferType writeAccessType = DecodeBufferType( + ptr->getType()->getPointerAddressSpace()); + if (writeAccessType != STATELESS && + writeAccessType != UAV && + writeAccessType != BINDLESS && + writeAccessType != SSH_BINDLESS) + { + return false; + } + if (statefulResourcesNotAliased && + writeAccessType != accessType) + { + // Note: the following improvements are possible: + // - check BTI index for direct access to UAV/CONSTANT_BUFFER + // - consider checking the "unique_id" part of the addressspace + // for bindless access + return false; + } + // By default assume that the write access may alias the load + return true; + }; + + // Lambda checks if a range of instructions has an instruction that may + // write to memory that potentially aliases the loaded memory. + auto HasAliasedWrite = [&IsAliasedWrite]( + auto beginIt, + auto endIt) + { + for (auto II = beginIt, EI = endIt; II != EI; ++II) + { + if (IsAliasedWrite(&*II)) + { + return true; + } + } + return false; + }; + + bool sameBB = chunkBB == candidateBB; + // First check the merge candidate BB + if (HasAliasedWrite( + sameBB ? dominatingChunk->getIterator() : candidateBB->begin(), + mergeCandidate->getIterator())) + { + return true; + } + if (!sameBB) + { + // Check the dominating BB + if (HasAliasedWrite(dominatingChunk->getIterator(), chunkBB->end())) + { + return true; + } + // Check all potentially reachable BBs starting from predecessors of the + // candidate merge BB and going up to the dominating chunk BB. 
+ IGC_ASSERT(DT.dominates(chunkBB, candidateBB)); + SmallSet seen; + SmallVector worklist(pred_begin(candidateBB), pred_end(candidateBB)); + seen.insert(chunkBB); + seen.insert(candidateBB); + while (!worklist.empty()) + { + BasicBlock* bb = worklist.pop_back_val(); + if (seen.count(bb) > 0) + { + continue; + } + if (HasAliasedWrite(bb->begin(), bb->end())) + { + return true; + } + seen.insert(bb); + worklist.insert(worklist.end(), pred_begin(bb), pred_end(bb)); + } + } + return false; +} char IGC::ConstantCoalescing::ID = 0; diff --git a/IGC/Compiler/CISACodeGen/ConstantCoalescing.hpp b/IGC/Compiler/CISACodeGen/ConstantCoalescing.hpp index dc64688948dc..a250b9bfd253 100644 --- a/IGC/Compiler/CISACodeGen/ConstantCoalescing.hpp +++ b/IGC/Compiler/CISACodeGen/ConstantCoalescing.hpp @@ -307,6 +307,11 @@ namespace IGC alignment_t GetAlignment(Instruction* load) const; void SetAlignment(Instruction* load, uint alignment); + + bool CheckForAliasingWrites( + uint32_t addrSpace, + llvm::Instruction* dominatingChunk, + llvm::Instruction* mergeCandidate); }; } diff --git a/IGC/Compiler/tests/ConstantCoalescing/ldraws.ll b/IGC/Compiler/tests/ConstantCoalescing/ldraws.ll index 7a21516abec1..1efce3995e2b 100644 --- a/IGC/Compiler/tests/ConstantCoalescing/ldraws.ll +++ b/IGC/Compiler/tests/ConstantCoalescing/ldraws.ll @@ -34,12 +34,12 @@ entry: ; CHECK: ret <4 x float> %2 -define <4 x float> @f1(i32 %src, i8 addrspace(2490373)* %dst) { +define <4 x float> @f1(i32 %src) { entry: %0 = inttoptr i32 %src to i8 addrspace(2490373)* %1 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %src, i32 4, i1 false) - %2 = add i32 %src, 16 - store i8 5, i8 addrspace(2490373)* %dst + call void @llvm.genx.GenISA.storeraw.indexed.p2490368i8.f32(i8 addrspace(2490373)* %0, i32 %src, float 0x0, i32 4, i1 false) + %2 = add i32 %src, 4 %3 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %2, i32 4, i1 false) %4 
= add i32 %src, 32 %5 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %4, i32 4, i1 false) @@ -53,19 +53,74 @@ entry: } ; CHECK-LABEL: define <4 x float> @f1 - ; CHECK: %1 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %src, i32 4, i1 false) - ; CHECK: store i8 5, i8 addrspace(2490373)* %dst, align 1 - ; CHECK: %3 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %2, i32 4, i1 false) - ; CHECK: %5 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %4, i32 4, i1 false) - ; CHECK: %7 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %6, i32 4, i1 false) + ; CHECK: %1 = call <1 x float> @llvm.genx.GenISA.ldrawvector.indexed.v1f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %src, i32 4, i1 false) + ; CHECK: %2 = extractelement <1 x float> %1, {{i[0-9]+}} 0 + ; CHECK: call void @llvm.genx.GenISA.storeraw.indexed.p2490368i8.f32(i8 addrspace(2490373)* %0, i32 %src, float 0.000000e+00, i32 4, i1 false) + ; CHECK: %3 = add i32 %src, 4 + ; CHECK: %4 = call <16 x float> @llvm.genx.GenISA.ldrawvector.indexed.v16f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %3, i32 4, i1 false) + ; CHECK: %5 = extractelement <16 x float> %4, {{i[0-9]+}} 11 + ; CHECK: %6 = extractelement <16 x float> %4, {{i[0-9]+}} 7 + ; CHECK: %7 = extractelement <16 x float> %4, {{i[0-9]+}} 0 + ; CHECK: %8 = insertelement <4 x float> undef, float %2, {{i[0-9]+}} 0 + ; CHECK: %9 = insertelement <4 x float> %8, float %7, {{i[0-9]+}} 1 + ; CHECK: %10 = insertelement <4 x float> %9, float %6, {{i[0-9]+}} 2 + ; CHECK: %11 = insertelement <4 x float> %10, float %5, {{i[0-9]+}} 3 + ; CHECK: ret <4 x float> %11 + +define <4 x float> @f2(i32 %src) { +entry: + %0 = inttoptr i32 %src to i8 addrspace(2490373)* + %1 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 
addrspace(2490373)* %0, i32 %src, i32 4, i1 false) + br label %storeBB +storeBB: + call void @llvm.genx.GenISA.storeraw.indexed.p2490368i8.f32(i8 addrspace(2490373)* %0, i32 %src, float 0x0, i32 4, i1 false) + br label %exitBB +exitBB: + %2 = add i32 %src, 4 + %3 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %2, i32 4, i1 false) + %4 = add i32 %src, 32 + %5 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %4, i32 4, i1 false) + %6 = add i32 %src, 48 + %7 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %6, i32 4, i1 false) + %8 = insertelement <4 x float> undef, float %1, i32 0 + %9 = insertelement <4 x float> %8, float %3, i32 1 + %10 = insertelement <4 x float> %9, float %5, i32 2 + %11 = insertelement <4 x float> %10, float %7, i32 3 + ret <4 x float> %11 +} + + ; CHECK-LABEL: define <4 x float> @f2 + ; CHECK: %1 = call <1 x float> @llvm.genx.GenISA.ldrawvector.indexed.v1f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %src, i32 4, i1 false) + ; CHECK: %2 = extractelement <1 x float> %1, {{i[0-9]+}} 0 + ; CHECK: br label %storeBB + ; CHECK-LABEL: storeBB: + ; CHECK: call void @llvm.genx.GenISA.storeraw.indexed.p2490368i8.f32(i8 addrspace(2490373)* %0, i32 %src, float 0.000000e+00, i32 4, i1 false) + ; CHECK: br label %exitBB + ; CHECK-LABEL: exitBB: + ; CHECK: %3 = add i32 %src, 4 + ; CHECK: %4 = call <16 x float> @llvm.genx.GenISA.ldrawvector.indexed.v16f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %3, i32 4, i1 false) + ; CHECK: %5 = extractelement <16 x float> %4, {{i[0-9]+}} 11 + ; CHECK: %6 = extractelement <16 x float> %4, {{i[0-9]+}} 7 + ; CHECK: %7 = extractelement <16 x float> %4, {{i[0-9]+}} 0 + ; CHECK: %8 = insertelement <4 x float> undef, float %2, {{i[0-9]+}} 0 + ; CHECK: %9 = insertelement <4 x float> %8, float %7, {{i[0-9]+}} 1 + ; CHECK: %10 = insertelement <4 x float> %9, float %6, {{i[0-9]+}} 2 + ; CHECK: 
%11 = insertelement <4 x float> %10, float %5, {{i[0-9]+}} 3 + ; CHECK: ret <4 x float> %11 + ; Function Attrs: argmemonly nounwind readonly declare float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)*, i32, i32, i1) argmemonly nounwind readonly -!igc.functions = !{!0, !3} +; Function Attrs: argmemonly nounwind writeonly +declare void @llvm.genx.GenISA.storeraw.indexed.p2490368i8.f32(i8 addrspace(2490373)*, i32, float, i32, i1) argmemonly nounwind writeonly + + +!igc.functions = !{!0, !3, !4} !0 = !{<4 x float> (i32)* @f0, !1} -!3 = !{<4 x float> (i32, i8 addrspace(2490373)*)* @f1, !1} +!3 = !{<4 x float> (i32)* @f1, !1} +!4 = !{<4 x float> (i32)* @f2, !1} !1 = !{!2} !2 = !{!"function_type", i32 0}