Skip to content

Commit

Permalink
Adding an experimental feature for load coalescing
Browse files Browse the repository at this point in the history
Adding an experimental feature for load coalescing
  • Loading branch information
mkhoshza authored and igcbot committed Oct 24, 2024
1 parent b6c58e9 commit cdaeaee
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 3 deletions.
78 changes: 75 additions & 3 deletions IGC/Compiler/CISACodeGen/MemOpt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2716,6 +2716,8 @@ namespace {
return splitVectorType(V, LdStKind::IS_LOAD);
}

void AllowDummyLoadCoalescing(InstAndOffsetPairs Loads);

// GatherCopy:
// copy multiple values (arg: Vals) into a single Dst (return value)
// (It's a packed copy, thus size(all Vals) = size(Dst).
Expand Down Expand Up @@ -2822,7 +2824,7 @@ bool IGC::doLdStCombine(const CodeGenContext* CGC) {
uint32_t keyval = IGC_GET_FLAG_VALUE(EnableLdStCombine);
if ((keyval & 0x3) == 1 && !CGC->platform.LSCEnabled())
return false;
return ((keyval & 0x3) != 0);
return ((keyval & 0x3) || (keyval & 0x4));
}

uint32_t IGC::getMaxStoreBytes(const CodeGenContext* CGC) {
Expand Down Expand Up @@ -3351,9 +3353,13 @@ void LdStCombine::combineLoads()
if ((IGC_GET_FLAG_VALUE(EnableLdStCombine) & 0x4) == 0)
return;

// Start with OCL, then apply to other APIs.
if (m_CGC->type != ShaderType::OPENCL_SHADER)
return;
{
if (!m_CGC->getModuleMetaData()->compOpt.EnableLdStCombineforLoad)
{
return;
}
}

// All load candidates with addr = common-base + const-offset
InstAndOffsetPairs Loads;
Expand Down Expand Up @@ -3439,6 +3445,14 @@ void LdStCombine::combineLoads()
}
}

// Experiment: if it is the last element of the load sequence and it does not end on a
// DWORD boundary, create a dummy load with the same alignment type as the previous load.
if (m_CGC->type != ShaderType::OPENCL_SHADER)
{
if (m_CGC->getModuleMetaData()->compOpt.EnableLdStCombinewithDummyLoad)
AllowDummyLoadCoalescing(Loads);
}

// Note: For now, each load is considered once. For example,
// load a
// store x : alias to load c
Expand Down Expand Up @@ -3802,6 +3816,64 @@ void LdStCombine::createBundles(BasicBlock* BB, InstAndOffsetPairs& LoadStores)
markVisited(LoadStores);
}

// AllowDummyLoadCoalescing:
//   [Experimental] If the last load candidate ends on a non-DWORD boundary and
//   one more load of the same size would make the end DWORD aligned, emit a
//   "dummy" load of the next element right before the last load and append it
//   to Loads.
//
//   Loads: load candidates (instruction + byte offset); the last entry is the
//          one that is inspected.
//
//   NOTE(review): Loads is taken by value, so the entry appended here is only
//   added to the local copy and is not visible to the caller. If the caller
//   is expected to see the dummy load, both the declaration and this
//   definition need to take InstAndOffsetPairs& instead.
//   NOTE(review): the dummy load reads the element following the last loaded
//   one; nothing here checks that this element is inside the accessed
//   object - confirm this is acceptable for the targeted address space.
void LdStCombine::AllowDummyLoadCoalescing(InstAndOffsetPairs Loads)
{
    // Currently supports only this pattern.
    //   %164 = add i32 %114, 1020
    //   %165 = and i32 %164, 1020
    //   %166 = getelementptr [1024 x half], [1024 x half] addrspace(3)* null, i32 0, i32 %165
    //   %167 = load half, half addrspace(3)* %166, align 8
    //   %168 = or i32 %165, 1
    //   %169 = getelementptr [1024 x half], [1024 x half] addrspace(3)* null, i32 0, i32 %168
    //   %170 = load half, half addrspace(3)* %169, align 2
    //   %171 = or i32 %165, 2
    //   %172 = getelementptr [1024 x half], [1024 x half] addrspace(3)* null, i32 0, i32 %171
    //   %173 = load half, half addrspace(3)* %172, align 4
    // to
    //   (same instructions as above, plus the following dummy load, which is
    //    inserted right before the last load %173)
    //   %174 = add i32 %171, 1
    //   %175 = getelementptr inbounds [1024 x half], [1024 x half] addrspace(3)* null, i32 0, i32 %174
    //   %176 = load half, half addrspace(3)* %175

    // Nothing to extend when there is no candidate at all; indexing the last
    // element of an empty list would be out of bounds.
    if (Loads.size() == 0)
        return;

    // Copy the entry: Loads may be grown below.
    LdStInfo LastLoad = Loads[Loads.size() - 1];

    // Only plain load instructions are handled; anything else is left alone
    // instead of being blindly reinterpreted as a LoadInst.
    LoadInst* lead = dyn_cast<LoadInst>(LastLoad.Inst);
    if (!lead)
        return;

    uint32_t LastLoadSize = (uint32_t)m_DL->getTypeStoreSize(LastLoad.Inst->getType());
    // End offset (exclusive) of the last load, relative to the common base.
    uint32_t currLoadSize = LastLoadSize + LastLoad.ByteOffset;
    if ((currLoadSize % 4) == 0)
        return; // already ends on a DWORD boundary

    // Replicating the last load to make it DWORD aligned: the dummy load has
    // the same size as the last load, so it only helps if exactly one more
    // load of that size reaches the boundary.
    uint32_t newLoadSize = LastLoadSize;
    if (((currLoadSize + newLoadSize) % 4) != 0)
        return;

    auto gep = dyn_cast<GetElementPtrInst>(lead->getPointerOperand());
    if (!gep)
        return;

    // Expect "gep <base null>, idx0, idx1", i.e. exactly three operands with
    // a constant null pointer as the base.
    if ((gep->getNumOperands() != 3) || !isa<ConstantPointerNull>(gep->getPointerOperand()))
        return;

    IRBuilder<> irBuilder(LastLoad.Inst);

    // Next element index = last index + 1. Build the constant with the type
    // of the index operand so the add is well-typed even if the index is not
    // an i32.
    Value* LastIdx = gep->getOperand(2);
    Value* NextIdx = irBuilder.CreateAdd(LastIdx, ConstantInt::get(LastIdx->getType(), 1));
    Value* gepArg[] = { gep->getOperand(1), NextIdx };
    Value* Addr = irBuilder.CreateInBoundsGEP(gep->getSourceElementType(),
        gep->getOperand(0), gepArg);
    Instruction* dummyLoad = static_cast<Instruction*>
        (irBuilder.CreateLoad(IGCLLVM::getNonOpaquePtrEltTy(Addr->getType()), Addr));

    // The dummy load immediately follows the last load in memory.
    Loads.push_back(LdStInfo(dummyLoad, LastLoad.ByteOffset + newLoadSize));
}

// A member of layout struct can be a vector type. This function will decide
// if the vector type or a sequence of its elements' types shall be used as
// the layout struct's member types. If spliting a vector type into a sequence
Expand Down
2 changes: 2 additions & 0 deletions IGC/common/MDFrameWork.h
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,8 @@ namespace IGC
bool DisableConstantCoalescing = false;
bool EnableUndefAlphaOutputAsRed = true;
bool WaEnableALTModeVisaWA = false;
bool EnableLdStCombineforLoad = false;
bool EnableLdStCombinewithDummyLoad = false;
bool NewSpillCostFunction = false;
bool ForceLargeGRFNum4RQ = false;
bool DisableEUFusion = false;
Expand Down
1 change: 1 addition & 0 deletions IGC/common/igc_flags.h
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,7 @@ DECLARE_IGC_REGKEY(DWORD, MemOptGEPCanon, 2, "[test] GEP canon
DECLARE_IGC_REGKEY(bool, DisableMemOpt2, false, "Disable MemOpt2", false)
DECLARE_IGC_REGKEY(bool, EnableExplicitCopyForByVal, true, "Enable generating an explicit copy (alloca + memcpy) in a caller for aggregate argumentes with byval attribute", true)
DECLARE_IGC_REGKEY(DWORD, EnableLdStCombine, 1, "Enable load/store combine pass if set to 1 (lsc message only) or 2; bit 3 = 1 [tmp for testing] : enabled load combine (intend to replace memopt)", true)
DECLARE_IGC_REGKEY(bool, EnableLdStCombinewithDummyLoad, false, "Adds extra load instruction to increase the size of coalesced load", true)
DECLARE_IGC_REGKEY(DWORD, MaxStoreVectorSizeInBytes, 0, "[LdStCombine] the max non-uniform vector size for the coalesced store. 0: compiler choice (default, 16(4DW)); others: 4/8/16/32", true)
DECLARE_IGC_REGKEY(DWORD, MaxLoadVectorSizeInBytes, 0, "[LdStCombine] the max non-uniform vector size for the coalesced load. 0: compiler choice (default, 16(4DW)); others: 4/8/16/32", true)
DECLARE_IGC_REGKEY(bool, DisableMergeStore, false, "[temp]If EnableLdStCombine is on, disable mergestore (memopt) if this is set. Temp key for testing", true)
Expand Down

0 comments on commit cdaeaee

Please sign in to comment.