diff --git a/IGC/Compiler/CISACodeGen/ConstantCoalescing.cpp b/IGC/Compiler/CISACodeGen/ConstantCoalescing.cpp index 7eba3a3f7d68..228b431ffb29 100644 --- a/IGC/Compiler/CISACodeGen/ConstantCoalescing.cpp +++ b/IGC/Compiler/CISACodeGen/ConstantCoalescing.cpp @@ -400,22 +400,9 @@ void ConstantCoalescing::ProcessBlock( std::vector & indcb_owloads, std::vector & indcb_gathers) { - // checking for stores for future load merging check - bool bbHasStores = false; - for (BasicBlock::iterator BBI_st = blk->begin(), BBE_st = blk->end(); - BBI_st != BBE_st; ++BBI_st) - { - if (isa(BBI_st) || - isa(BBI_st)) - { - bbHasStores = true; - break; - } - } - bool isCPS = false; - bool skipLdrawOpt = isCPS || bbHasStores; + bool skipLdrawOpt = isCPS; // get work-item analysis, need to update uniformness information for (BasicBlock::iterator BBI = blk->begin(), BBE = blk->end(); BBI != BBE; ++BBI) @@ -843,7 +830,8 @@ void ConstantCoalescing::MergeScatterLoad(Instruction* load, BufChunk* cur_chunk = *rit; if (CompareBufferBase(cur_chunk->bufIdxV, cur_chunk->addrSpace, bufIdxV, addrSpace) && cur_chunk->baseIdxV == eltIdxV && - cur_chunk->chunkIO->getType()->getScalarType() == load->getType()->getScalarType()) + cur_chunk->chunkIO->getType()->getScalarType() == load->getType()->getScalarType() && + !CheckForAliasingWrites(addrSpace, cur_chunk->chunkIO, load)) { uint lb = std::min(eltid, cur_chunk->chunkStart); uint ub = std::max(eltid + maxEltPlus, cur_chunk->chunkStart + cur_chunk->chunkSize); @@ -1242,7 +1230,8 @@ void ConstantCoalescing::MergeUniformLoad(Instruction* load, { if (CompareBufferBase(cur_chunk->bufIdxV, cur_chunk->addrSpace, bufIdxV, addrSpace) && cur_chunk->baseIdxV == eltIdxV && - cur_chunk->chunkIO->getType()->getScalarType() == LoadEltTy) + cur_chunk->chunkIO->getType()->getScalarType() == LoadEltTy && + !CheckForAliasingWrites(addrSpace, cur_chunk->chunkIO, load)) { uint lb = std::min(eltid, cur_chunk->chunkStart); uint ub = std::max(eltid + maxEltPlus, 
cur_chunk->chunkStart + cur_chunk->chunkSize); @@ -2619,4 +2608,127 @@ void ConstantCoalescing::ReplaceLoadWithSamplerLoad( loadToReplace->replaceAllUsesWith(result); } +// Check if any of the paths from the `dominatingChunk` to `mergeCandidate` has +// aliasing memory store or atomic operations. +bool ConstantCoalescing::CheckForAliasingWrites( + uint32_t addrSpace, + Instruction* dominatingChunk, + Instruction* mergeCandidate) +{ + BufferType accessType = DecodeBufferType(addrSpace); + // Only writable resources need to be checked. + if (accessType == STATELESS_READONLY || + accessType == CONSTANT_BUFFER || + accessType == BINDLESS_CONSTANT_BUFFER || + accessType == SSH_BINDLESS_CONSTANT_BUFFER) + { + return false; + } + + const DominatorTree& DT = getAnalysis().getDomTree(); + const bool statefulResourcesNotAliased = + m_ctx->getModuleMetaData()->statefulResourcesNotAliased; + BasicBlock* chunkBB = dominatingChunk->getParent(); + BasicBlock* candidateBB = mergeCandidate->getParent(); + + // Lambda checks if an instruction may write to memory that potentially + // aliases the loaded memory. + auto IsAliasedWrite = [accessType, statefulResourcesNotAliased]( + Instruction* inst) + { + // Treat memory fences as aliasing writes as they may change results + // of subsequent loads. 
+ auto* intr = dyn_cast(inst); + if (intr && + (intr->getIntrinsicID() == GenISAIntrinsic::GenISA_memoryfence || + intr->getIntrinsicID() == GenISAIntrinsic::GenISA_LSCFence)) + { + return true; + } + if (!inst->mayWriteToMemory()) + { + return false; + } + Value* ptr = GetBufferOperand(inst); + if (ptr == nullptr || !ptr->getType()->isPointerTy()) + { + return false; + } + BufferType writeAccessType = DecodeBufferType( + ptr->getType()->getPointerAddressSpace()); + if (writeAccessType != STATELESS && + writeAccessType != UAV && + writeAccessType != BINDLESS && + writeAccessType != SSH_BINDLESS) + { + return false; + } + if (statefulResourcesNotAliased && + writeAccessType != accessType) + { + // Note: the following improvements are possible: + // - check BTI index for direct access to UAV/CONSTANT_BUFFER + // - consider checking the "unique_id" part of the addressspace + // for bindless access + return false; + } + // By default assume that the write access may alias the load + return true; + }; + + // Lambda checks if a range of instructions has an instruction that may + // write to memory that potentially aliases the loaded memory. + auto HasAliasedWrite = [&IsAliasedWrite]( + auto beginIt, + auto endIt) + { + for (auto II = beginIt, EI = endIt; II != EI; ++II) + { + if (IsAliasedWrite(&*II)) + { + return true; + } + } + return false; + }; + + bool sameBB = chunkBB == candidateBB; + // First check the merge candidate BB + if (HasAliasedWrite( + sameBB ? dominatingChunk->getIterator() : candidateBB->begin(), + mergeCandidate->getIterator())) + { + return true; + } + if (!sameBB) + { + // Check the dominating BB + if (HasAliasedWrite(dominatingChunk->getIterator(), chunkBB->end())) + { + return true; + } + // Check all potentially reachable BBs starting from predecessors of the + // candidate merge BB and going up to the dominating chunk BB. 
+ IGC_ASSERT(DT.dominates(chunkBB, candidateBB)); + SmallSet seen; + SmallVector worklist(pred_begin(candidateBB), pred_end(candidateBB)); + seen.insert(chunkBB); + seen.insert(candidateBB); + while (!worklist.empty()) + { + BasicBlock* bb = worklist.pop_back_val(); + if (seen.count(bb) > 0) + { + continue; + } + if (HasAliasedWrite(bb->begin(), bb->end())) + { + return true; + } + seen.insert(bb); + worklist.insert(worklist.end(), pred_begin(bb), pred_end(bb)); + } + } + return false; +} char IGC::ConstantCoalescing::ID = 0; diff --git a/IGC/Compiler/CISACodeGen/ConstantCoalescing.hpp b/IGC/Compiler/CISACodeGen/ConstantCoalescing.hpp index dc64688948dc..a250b9bfd253 100644 --- a/IGC/Compiler/CISACodeGen/ConstantCoalescing.hpp +++ b/IGC/Compiler/CISACodeGen/ConstantCoalescing.hpp @@ -307,6 +307,11 @@ namespace IGC alignment_t GetAlignment(Instruction* load) const; void SetAlignment(Instruction* load, uint alignment); + + bool CheckForAliasingWrites( + uint32_t addrSpace, + llvm::Instruction* dominatingChunk, + llvm::Instruction* mergeCandidate); }; } diff --git a/IGC/Compiler/tests/ConstantCoalescing/ldraws.ll b/IGC/Compiler/tests/ConstantCoalescing/ldraws.ll index 7a21516abec1..1efce3995e2b 100644 --- a/IGC/Compiler/tests/ConstantCoalescing/ldraws.ll +++ b/IGC/Compiler/tests/ConstantCoalescing/ldraws.ll @@ -34,12 +34,12 @@ entry: ; CHECK: ret <4 x float> %2 -define <4 x float> @f1(i32 %src, i8 addrspace(2490373)* %dst) { +define <4 x float> @f1(i32 %src) { entry: %0 = inttoptr i32 %src to i8 addrspace(2490373)* %1 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %src, i32 4, i1 false) - %2 = add i32 %src, 16 - store i8 5, i8 addrspace(2490373)* %dst + call void @llvm.genx.GenISA.storeraw.indexed.p2490368i8.f32(i8 addrspace(2490373)* %0, i32 %src, float 0x0, i32 4, i1 false) + %2 = add i32 %src, 4 %3 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %2, i32 4, i1 false) %4 
= add i32 %src, 32 %5 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %4, i32 4, i1 false) @@ -53,19 +53,74 @@ entry: } ; CHECK-LABEL: define <4 x float> @f1 - ; CHECK: %1 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %src, i32 4, i1 false) - ; CHECK: store i8 5, i8 addrspace(2490373)* %dst, align 1 - ; CHECK: %3 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %2, i32 4, i1 false) - ; CHECK: %5 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %4, i32 4, i1 false) - ; CHECK: %7 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %6, i32 4, i1 false) + ; CHECK: %1 = call <1 x float> @llvm.genx.GenISA.ldrawvector.indexed.v1f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %src, i32 4, i1 false) + ; CHECK: %2 = extractelement <1 x float> %1, {{i[0-9]+}} 0 + ; CHECK: call void @llvm.genx.GenISA.storeraw.indexed.p2490368i8.f32(i8 addrspace(2490373)* %0, i32 %src, float 0.000000e+00, i32 4, i1 false) + ; CHECK: %3 = add i32 %src, 4 + ; CHECK: %4 = call <16 x float> @llvm.genx.GenISA.ldrawvector.indexed.v16f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %3, i32 4, i1 false) + ; CHECK: %5 = extractelement <16 x float> %4, {{i[0-9]+}} 11 + ; CHECK: %6 = extractelement <16 x float> %4, {{i[0-9]+}} 7 + ; CHECK: %7 = extractelement <16 x float> %4, {{i[0-9]+}} 0 + ; CHECK: %8 = insertelement <4 x float> undef, float %2, {{i[0-9]+}} 0 + ; CHECK: %9 = insertelement <4 x float> %8, float %7, {{i[0-9]+}} 1 + ; CHECK: %10 = insertelement <4 x float> %9, float %6, {{i[0-9]+}} 2 + ; CHECK: %11 = insertelement <4 x float> %10, float %5, {{i[0-9]+}} 3 + ; CHECK: ret <4 x float> %11 + +define <4 x float> @f2(i32 %src) { +entry: + %0 = inttoptr i32 %src to i8 addrspace(2490373)* + %1 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 
addrspace(2490373)* %0, i32 %src, i32 4, i1 false) + br label %storeBB +storeBB: + call void @llvm.genx.GenISA.storeraw.indexed.p2490368i8.f32(i8 addrspace(2490373)* %0, i32 %src, float 0x0, i32 4, i1 false) + br label %exitBB +exitBB: + %2 = add i32 %src, 4 + %3 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %2, i32 4, i1 false) + %4 = add i32 %src, 32 + %5 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %4, i32 4, i1 false) + %6 = add i32 %src, 48 + %7 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %6, i32 4, i1 false) + %8 = insertelement <4 x float> undef, float %1, i32 0 + %9 = insertelement <4 x float> %8, float %3, i32 1 + %10 = insertelement <4 x float> %9, float %5, i32 2 + %11 = insertelement <4 x float> %10, float %7, i32 3 + ret <4 x float> %11 +} + + ; CHECK-LABEL: define <4 x float> @f2 + ; CHECK: %1 = call <1 x float> @llvm.genx.GenISA.ldrawvector.indexed.v1f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %src, i32 4, i1 false) + ; CHECK: %2 = extractelement <1 x float> %1, {{i[0-9]+}} 0 + ; CHECK: br label %storeBB + ; CHECK-LABEL: storeBB: + ; CHECK: call void @llvm.genx.GenISA.storeraw.indexed.p2490368i8.f32(i8 addrspace(2490373)* %0, i32 %src, float 0.000000e+00, i32 4, i1 false) + ; CHECK: br label %exitBB + ; CHECK-LABEL: exitBB: + ; CHECK: %3 = add i32 %src, 4 + ; CHECK: %4 = call <16 x float> @llvm.genx.GenISA.ldrawvector.indexed.v16f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %3, i32 4, i1 false) + ; CHECK: %5 = extractelement <16 x float> %4, {{i[0-9]+}} 11 + ; CHECK: %6 = extractelement <16 x float> %4, {{i[0-9]+}} 7 + ; CHECK: %7 = extractelement <16 x float> %4, {{i[0-9]+}} 0 + ; CHECK: %8 = insertelement <4 x float> undef, float %2, {{i[0-9]+}} 0 + ; CHECK: %9 = insertelement <4 x float> %8, float %7, {{i[0-9]+}} 1 + ; CHECK: %10 = insertelement <4 x float> %9, float %6, {{i[0-9]+}} 2 + ; CHECK: 
%11 = insertelement <4 x float> %10, float %5, {{i[0-9]+}} 3 + ; CHECK: ret <4 x float> %11 + ; Function Attrs: argmemonly nounwind readonly declare float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)*, i32, i32, i1) argmemonly nounwind readonly -!igc.functions = !{!0, !3} +; Function Attrs: argmemonly nounwind writeonly +declare void @llvm.genx.GenISA.storeraw.indexed.p2490368i8.f32(i8 addrspace(2490373)*, i32, float, i32, i1) argmemonly nounwind writeonly + + +!igc.functions = !{!0, !3, !4} !0 = !{<4 x float> (i32)* @f0, !1} -!3 = !{<4 x float> (i32, i8 addrspace(2490373)*)* @f1, !1} +!3 = !{<4 x float> (i32)* @f1, !1} +!4 = !{<4 x float> (i32)* @f2, !1} !1 = !{!2} !2 = !{!"function_type", i32 0}