Skip to content

Commit

Permalink
Fix detection of memory writes in ConstantCoalescing
Browse files Browse the repository at this point in the history
When merging loads to writable resources, ConstantCoalescing must check if there
are no memory write or atomic operations in all potentially reachable basic
blocks between the two merge candidates.
  • Loading branch information
mmerecki authored and igcbot committed Jun 28, 2023
1 parent db40bd6 commit 3af2e8f
Show file tree
Hide file tree
Showing 3 changed files with 198 additions and 26 deletions.
144 changes: 128 additions & 16 deletions IGC/Compiler/CISACodeGen/ConstantCoalescing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -400,22 +400,9 @@ void ConstantCoalescing::ProcessBlock(
std::vector<BufChunk*> & indcb_owloads,
std::vector<BufChunk*> & indcb_gathers)
{
// checking for stores for future load merging check
bool bbHasStores = false;
for (BasicBlock::iterator BBI_st = blk->begin(), BBE_st = blk->end();
BBI_st != BBE_st; ++BBI_st)
{
if (isa<StoreInst>(BBI_st) ||
isa<StoreRawIntrinsic>(BBI_st))
{
bbHasStores = true;
break;
}
}

bool isCPS = false;

bool skipLdrawOpt = isCPS || bbHasStores;
bool skipLdrawOpt = isCPS;
// get work-item analysis, need to update uniformness information
for (BasicBlock::iterator BBI = blk->begin(), BBE = blk->end();
BBI != BBE; ++BBI)
Expand Down Expand Up @@ -843,7 +830,8 @@ void ConstantCoalescing::MergeScatterLoad(Instruction* load,
BufChunk* cur_chunk = *rit;
if (CompareBufferBase(cur_chunk->bufIdxV, cur_chunk->addrSpace, bufIdxV, addrSpace) &&
cur_chunk->baseIdxV == eltIdxV &&
cur_chunk->chunkIO->getType()->getScalarType() == load->getType()->getScalarType())
cur_chunk->chunkIO->getType()->getScalarType() == load->getType()->getScalarType() &&
!CheckForAliasingWrites(addrSpace, cur_chunk->chunkIO, load))
{
uint lb = std::min(eltid, cur_chunk->chunkStart);
uint ub = std::max(eltid + maxEltPlus, cur_chunk->chunkStart + cur_chunk->chunkSize);
Expand Down Expand Up @@ -1242,7 +1230,8 @@ void ConstantCoalescing::MergeUniformLoad(Instruction* load,
{
if (CompareBufferBase(cur_chunk->bufIdxV, cur_chunk->addrSpace, bufIdxV, addrSpace) &&
cur_chunk->baseIdxV == eltIdxV &&
cur_chunk->chunkIO->getType()->getScalarType() == LoadEltTy)
cur_chunk->chunkIO->getType()->getScalarType() == LoadEltTy &&
!CheckForAliasingWrites(addrSpace, cur_chunk->chunkIO, load))
{
uint lb = std::min(eltid, cur_chunk->chunkStart);
uint ub = std::max(eltid + maxEltPlus, cur_chunk->chunkStart + cur_chunk->chunkSize);
Expand Down Expand Up @@ -2619,4 +2608,127 @@ void ConstantCoalescing::ReplaceLoadWithSamplerLoad(
loadToReplace->replaceAllUsesWith(result);
}

// Check if any of the paths from the `dominatingChunk` to `mergeCandidate` has
// aliasing memory store or atomic operations.
bool ConstantCoalescing::CheckForAliasingWrites(
uint32_t addrSpace,
Instruction* dominatingChunk,
Instruction* mergeCandidate)
{
BufferType accessType = DecodeBufferType(addrSpace);
// Only writable resources need to be checked.
if (accessType == STATELESS_READONLY ||
accessType == CONSTANT_BUFFER ||
accessType == BINDLESS_CONSTANT_BUFFER ||
accessType == SSH_BINDLESS_CONSTANT_BUFFER)
{
return false;
}

const DominatorTree& DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
const bool statefulResourcesNotAliased =
m_ctx->getModuleMetaData()->statefulResourcesNotAliased;
BasicBlock* chunkBB = dominatingChunk->getParent();
BasicBlock* candidateBB = mergeCandidate->getParent();

// Lambda checks if an instruction may write to memory that potentially
// aliases the loaded memory.
auto IsAliasedWrite = [accessType, statefulResourcesNotAliased](
Instruction* inst)
{
// Treat memory fences as aliasing writes as they may change results
// of subsequent loads.
auto* intr = dyn_cast<GenIntrinsicInst>(inst);
if (intr &&
(intr->getIntrinsicID() == GenISAIntrinsic::GenISA_memoryfence ||
intr->getIntrinsicID() == GenISAIntrinsic::GenISA_LSCFence))
{
return true;
}
if (!inst->mayWriteToMemory())
{
return false;
}
Value* ptr = GetBufferOperand(inst);
if (ptr == nullptr || !ptr->getType()->isPointerTy())
{
return false;
}
BufferType writeAccessType = DecodeBufferType(
ptr->getType()->getPointerAddressSpace());
if (writeAccessType != STATELESS &&
writeAccessType != UAV &&
writeAccessType != BINDLESS &&
writeAccessType != SSH_BINDLESS)
{
return false;
}
if (statefulResourcesNotAliased &&
writeAccessType != accessType)
{
// Note: the following improvements are possible:
// - check BTI index for direct access to UAV/CONSTANT_BUFFER
// - consider checking the "unique_id" part of the addressspace
// for bindless access
return false;
}
// By default assume that the write access may alias the load
return true;
};

// Lambda checks if a range of instructions has an instruction that may
// write to memory that potentially aliases the loaded memory.
auto HasAliasedWrite = [&IsAliasedWrite](
auto beginIt,
auto endIt)
{
for (auto II = beginIt, EI = endIt; II != EI; ++II)
{
if (IsAliasedWrite(&*II))
{
return true;
}
}
return false;
};

bool sameBB = chunkBB == candidateBB;
// First check the merge candidate BB
if (HasAliasedWrite(
sameBB ? dominatingChunk->getIterator() : candidateBB->begin(),
mergeCandidate->getIterator()))
{
return true;
}
if (!sameBB)
{
// Check the dominating BB
if (HasAliasedWrite(dominatingChunk->getIterator(), chunkBB->end()))
{
return true;
}
// Check all potentially reachable BBs starting from predecessors of the
// candidate merge BB and going up to to the dominating chunk BB.
IGC_ASSERT(DT.dominates(chunkBB, candidateBB));
SmallSet<BasicBlock*, 16> seen;
SmallVector<BasicBlock*, 16> worklist(pred_begin(candidateBB), pred_end(candidateBB));
seen.insert(chunkBB);
seen.insert(candidateBB);
while (!worklist.empty())
{
BasicBlock* bb = worklist.pop_back_val();
if (seen.count(bb) > 0)
{
continue;
}
if (HasAliasedWrite(bb->begin(), bb->end()))
{
return true;
}
seen.insert(bb);
worklist.insert(worklist.end(), pred_begin(bb), pred_end(bb));
}
}
return false;
}
char IGC::ConstantCoalescing::ID = 0;
5 changes: 5 additions & 0 deletions IGC/Compiler/CISACodeGen/ConstantCoalescing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,11 @@ namespace IGC

alignment_t GetAlignment(Instruction* load) const;
void SetAlignment(Instruction* load, uint alignment);

bool CheckForAliasingWrites(
uint32_t addrSpace,
llvm::Instruction* dominatingChunk,
llvm::Instruction* mergeCandidate);
};

}
75 changes: 65 additions & 10 deletions IGC/Compiler/tests/ConstantCoalescing/ldraws.ll
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,12 @@ entry:
; CHECK: ret <4 x float> %2


define <4 x float> @f1(i32 %src, i8 addrspace(2490373)* %dst) {
define <4 x float> @f1(i32 %src) {
entry:
%0 = inttoptr i32 %src to i8 addrspace(2490373)*
%1 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %src, i32 4, i1 false)
%2 = add i32 %src, 16
store i8 5, i8 addrspace(2490373)* %dst
call void @llvm.genx.GenISA.storeraw.indexed.p2490368i8.f32(i8 addrspace(2490373)* %0, i32 %src, float 0x0, i32 4, i1 false)
%2 = add i32 %src, 4
%3 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %2, i32 4, i1 false)
%4 = add i32 %src, 32
%5 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %4, i32 4, i1 false)
Expand All @@ -53,19 +53,74 @@ entry:
}

; CHECK-LABEL: define <4 x float> @f1
; CHECK: %1 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %src, i32 4, i1 false)
; CHECK: store i8 5, i8 addrspace(2490373)* %dst, align 1
; CHECK: %3 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %2, i32 4, i1 false)
; CHECK: %5 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %4, i32 4, i1 false)
; CHECK: %7 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %6, i32 4, i1 false)
; CHECK: %1 = call <1 x float> @llvm.genx.GenISA.ldrawvector.indexed.v1f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %src, i32 4, i1 false)
; CHECK: %2 = extractelement <1 x float> %1, {{i[0-9]+}} 0
; CHECK: call void @llvm.genx.GenISA.storeraw.indexed.p2490368i8.f32(i8 addrspace(2490373)* %0, i32 %src, float 0.000000e+00, i32 4, i1 false)
; CHECK: %3 = add i32 %src, 4
; CHECK: %4 = call <16 x float> @llvm.genx.GenISA.ldrawvector.indexed.v16f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %3, i32 4, i1 false)
; CHECK: %5 = extractelement <16 x float> %4, {{i[0-9]+}} 11
; CHECK: %6 = extractelement <16 x float> %4, {{i[0-9]+}} 7
; CHECK: %7 = extractelement <16 x float> %4, {{i[0-9]+}} 0
; CHECK: %8 = insertelement <4 x float> undef, float %2, {{i[0-9]+}} 0
; CHECK: %9 = insertelement <4 x float> %8, float %7, {{i[0-9]+}} 1
; CHECK: %10 = insertelement <4 x float> %9, float %6, {{i[0-9]+}} 2
; CHECK: %11 = insertelement <4 x float> %10, float %5, {{i[0-9]+}} 3
; CHECK: ret <4 x float> %11

define <4 x float> @f2(i32 %src) {
entry:
%0 = inttoptr i32 %src to i8 addrspace(2490373)*
%1 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %src, i32 4, i1 false)
br label %storeBB
storeBB:
call void @llvm.genx.GenISA.storeraw.indexed.p2490368i8.f32(i8 addrspace(2490373)* %0, i32 %src, float 0x0, i32 4, i1 false)
br label %exitBB
exitBB:
%2 = add i32 %src, 4
%3 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %2, i32 4, i1 false)
%4 = add i32 %src, 32
%5 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %4, i32 4, i1 false)
%6 = add i32 %src, 48
%7 = call fast float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %6, i32 4, i1 false)
%8 = insertelement <4 x float> undef, float %1, i32 0
%9 = insertelement <4 x float> %8, float %3, i32 1
%10 = insertelement <4 x float> %9, float %5, i32 2
%11 = insertelement <4 x float> %10, float %7, i32 3
ret <4 x float> %11
}

; CHECK-LABEL: define <4 x float> @f2
; CHECK: %1 = call <1 x float> @llvm.genx.GenISA.ldrawvector.indexed.v1f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %src, i32 4, i1 false)
; CHECK: %2 = extractelement <1 x float> %1, {{i[0-9]+}} 0
; CHECK: br label %storeBB
; CHECK-LABEL: storeBB:
; CHECK: call void @llvm.genx.GenISA.storeraw.indexed.p2490368i8.f32(i8 addrspace(2490373)* %0, i32 %src, float 0.000000e+00, i32 4, i1 false)
; CHECK: br label %exitBB
; CHECK-LABEL: exitBB:
; CHECK: %3 = add i32 %src, 4
; CHECK: %4 = call <16 x float> @llvm.genx.GenISA.ldrawvector.indexed.v16f32.p2490373i8(i8 addrspace(2490373)* %0, i32 %3, i32 4, i1 false)
; CHECK: %5 = extractelement <16 x float> %4, {{i[0-9]+}} 11
; CHECK: %6 = extractelement <16 x float> %4, {{i[0-9]+}} 7
; CHECK: %7 = extractelement <16 x float> %4, {{i[0-9]+}} 0
; CHECK: %8 = insertelement <4 x float> undef, float %2, {{i[0-9]+}} 0
; CHECK: %9 = insertelement <4 x float> %8, float %7, {{i[0-9]+}} 1
; CHECK: %10 = insertelement <4 x float> %9, float %6, {{i[0-9]+}} 2
; CHECK: %11 = insertelement <4 x float> %10, float %5, {{i[0-9]+}} 3
; CHECK: ret <4 x float> %11


; Function Attrs: argmemonly nounwind readonly
declare float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(i8 addrspace(2490373)*, i32, i32, i1) argmemonly nounwind readonly

!igc.functions = !{!0, !3}
; Function Attrs: argmemonly nounwind writeonly
declare void @llvm.genx.GenISA.storeraw.indexed.p2490368i8.f32(i8 addrspace(2490373)*, i32, float, i32, i1) argmemonly nounwind writeonly


!igc.functions = !{!0, !3, !4}

!0 = !{<4 x float> (i32)* @f0, !1}
!3 = !{<4 x float> (i32, i8 addrspace(2490373)*)* @f1, !1}
!3 = !{<4 x float> (i32)* @f1, !1}
!4 = !{<4 x float> (i32)* @f2, !1}

!1 = !{!2}
!2 = !{!"function_type", i32 0}
Expand Down

0 comments on commit 3af2e8f

Please sign in to comment.