Skip to content

Commit

Permalink
Split vectors containing dead elements
Browse files Browse the repository at this point in the history
Use the LiveElements analysis in the GenXLegalization pass to check whether
a vector contains any dead elements. If so, split the instruction to isolate
the dead parts, which will be removed by later passes.
  • Loading branch information
mshelego authored and igcbot committed Sep 21, 2023
1 parent 3c0d1ee commit e41de38
Show file tree
Hide file tree
Showing 6 changed files with 137 additions and 9 deletions.
32 changes: 31 additions & 1 deletion IGC/VectorCompiler/lib/GenXCodeGen/GenXLegalization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ SPDX-License-Identifier: MIT
#include "GenXAlignmentInfo.h"
#include "GenXBaling.h"
#include "GenXIntrinsics.h"
#include "GenXLiveElements.h"
#include "GenXSubtarget.h"
#include "GenXTargetMachine.h"
#include "GenXUtil.h"
Expand Down Expand Up @@ -213,6 +214,7 @@ struct LegalPredSize {
class GenXLegalization : public FunctionPass {
enum { DETERMINEWIDTH_UNBALE = 0, DETERMINEWIDTH_NO_SPLIT = 256 };
GenXBaling *Baling = nullptr;
GenXFuncLiveElements *LE = nullptr;
const GenXSubtarget *ST = nullptr;
DominatorTree *DT = nullptr;
ScalarEvolution *SE = nullptr;
Expand Down Expand Up @@ -406,6 +408,7 @@ void initializeGenXLegalizationPass(PassRegistry &);
INITIALIZE_PASS_BEGIN(GenXLegalization, "GenXLegalization", "GenXLegalization",
false, false)
INITIALIZE_PASS_DEPENDENCY(GenXFuncBaling)
INITIALIZE_PASS_DEPENDENCY(GenXFuncLiveElements)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(GenXLegalization, "GenXLegalization", "GenXLegalization",
Expand All @@ -418,6 +421,7 @@ FunctionPass *llvm::createGenXLegalizationPass() {

void GenXLegalization::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<GenXFuncBaling>();
AU.addRequired<GenXFuncLiveElements>();
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<TargetPassConfig>();
AU.addRequired<DominatorTreeWrapperPass>();
Expand All @@ -430,6 +434,7 @@ void GenXLegalization::getAnalysisUsage(AnalysisUsage &AU) const {
*/
bool GenXLegalization::runOnFunction(Function &F) {
Baling = &getAnalysis<GenXFuncBaling>();
LE = &getAnalysis<GenXFuncLiveElements>();
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
ST = &getAnalysis<TargetPassConfig>()
.getTM<GenXTargetMachine>()
Expand Down Expand Up @@ -1141,14 +1146,39 @@ unsigned GenXLegalization::determineWidth(unsigned WholeWidth,
// * this legalization pass does not have access to FGs
ExecSizeAllowedBits &= 0x1f;

auto *Head = B.getHeadIgnoreGStore();

if (WholeWidth > 1) {
Value *Dest = Head->Inst;
if (Head->Info.Type == BaleInfo::WRREGION ||
Head->Info.Type == BaleInfo::WRPREDREGION ||
Head->Info.Type == BaleInfo::WRPREDPREDREGION)
Dest = Head->Inst->getOperand(1);
auto LiveElems = LE->getLiveElements(Dest);
if (LiveElems.canSplitDead()) {
IGC_ASSERT(LiveElems[0].size() == WholeWidth);
bool StartBit = LiveElems[0][StartIdx];
unsigned Idx = StartIdx + 1;
while (Idx < LiveElems[0].size() && LiveElems[0][Idx] == StartBit)
Idx++;
unsigned Size = Idx - StartIdx;
unsigned Mask = 0;
while (Size) {
Mask <<= 1;
Mask |= 1;
Size >>= 1;
}
ExecSizeAllowedBits &= Mask;
}
}

unsigned MainInstMinWidth =
1 << countTrailingZeros(ExecSizeAllowedBits, ZB_Undefined);
// Determine the vector width that we need to split into.
bool IsReadSameVector = false;
unsigned Width = WholeWidth - StartIdx;
unsigned PredMinWidth = 1;
Value *WrRegionInput = nullptr;
auto Head = B.getHeadIgnoreGStore();
if (Head->Info.Type == BaleInfo::WRREGION)
WrRegionInput =
Head->Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum);
Expand Down
15 changes: 9 additions & 6 deletions IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveElements.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -262,9 +262,6 @@ LiveElements LiveElementsAnalysis::getOperandLiveElements(
IGC_ASSERT(OperandNo < Inst->getNumOperands());
auto OpTy = Inst->getOperand(OperandNo)->getType();

if (InstLiveElems.isAllDead() && !Inst->mayHaveSideEffects())
return LiveElements(OpTy);

if (auto BCI = dyn_cast<BitCastInst>(Inst))
return getBitCastLiveElements(BCI, InstLiveElems);

Expand Down Expand Up @@ -299,10 +296,16 @@ LiveElements LiveElementsAnalysis::getOperandLiveElements(
if (ID == GenXIntrinsic::genx_addc || ID == GenXIntrinsic::genx_subb)
return getTwoDstInstLiveElements(InstLiveElems);

if (isElementWise(Inst))
return InstLiveElems;
auto OpLiveElems = LiveElements(OpTy, !InstLiveElems.isAllDead() ||
Inst->mayHaveSideEffects());
if (!isElementWise(Inst) || InstLiveElems.size() != OpLiveElems.size())
return OpLiveElems;

for (unsigned Idx = 0; Idx < InstLiveElems.size(); Idx++)
if (InstLiveElems[Idx].size() != OpLiveElems[Idx].size())
return OpLiveElems;

return LiveElements(OpTy, true);
return InstLiveElems;
}

// isRootInst : check if instruction should be the start point for backward
Expand Down
10 changes: 10 additions & 0 deletions IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveElements.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,16 @@ class LiveElements {
});
}

// canSplitDead : check if the live elements form a single contiguous run, so
// that the dead elements around it can be separated by splitting.
//
// Returns false when size() > 1, when everything is dead, or when nothing is
// dead (no dead part to isolate). Otherwise scans LiveElems[0] from its first
// set bit to its last set bit and returns false if any bit inside that range
// is clear; returns true if the range has no holes.
bool canSplitDead() const {
  if (size() > 1 || isAllDead() || !isAnyDead())
    return false;
  // Read-only scan: bind by reference instead of copying the whole bit set.
  const auto &Bits = LiveElems[0];
  // Both bounds are loop-invariant, so look them up once rather than calling
  // find_last() on every iteration.
  const int FirstLive = Bits.find_first();
  const int LastLive = Bits.find_last();
  for (int Idx = FirstLive; Idx <= LastLive; Idx++)
    if (!Bits[Idx])
      return false;
  return true;
}

bool operator==(const LiveElements &Rhs) const {
return LiveElems == Rhs.LiveElems;
}
Expand Down
2 changes: 1 addition & 1 deletion IGC/VectorCompiler/test/GenXLegalization/debug_gstore.ll
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ define void @test_transform(<128 x i8>* %a) !dbg !6 {
entry:
%0 = load <128 x i8>, <128 x i8>* %a, !dbg !12
call void @llvm.dbg.value(metadata <128 x i8> %0, metadata !9, metadata !DIExpression()), !dbg !12
%1 = call <128 x i8> @llvm.genx.rdregioni.v128i8.v128i8.i16(<128 x i8> %0, i32 1, i32 1, i32 0, i16 16, i32 0), !dbg !13
%1 = call <128 x i8> @llvm.genx.rdregioni.v128i8.v128i8.i16(<128 x i8> %0, i32 1, i32 1, i32 0, i16 0, i32 0), !dbg !13
call void @llvm.dbg.value(metadata <128 x i8> %1, metadata !9, metadata !DIExpression()), !dbg !13
store <128 x i8> %1, <128 x i8>* @global_vec, !dbg !14
ret void, !dbg !15
Expand Down
2 changes: 1 addition & 1 deletion IGC/VectorCompiler/test/Legalization/debug-gstore.ll
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ define void @test_transform(<128 x i8>* %a) !dbg !6 {
entry:
%0 = load <128 x i8>, <128 x i8>* %a, !dbg !12
call void @llvm.dbg.value(metadata <128 x i8> %0, metadata !9, metadata !DIExpression()), !dbg !12
%1 = call <128 x i8> @llvm.genx.rdregioni.v128i8.v128i8.i16(<128 x i8> %0, i32 1, i32 1, i32 0, i16 16, i32 0), !dbg !13
%1 = call <128 x i8> @llvm.genx.rdregioni.v128i8.v128i8.i16(<128 x i8> %0, i32 1, i32 1, i32 0, i16 0, i32 0), !dbg !13
call void @llvm.dbg.value(metadata <128 x i8> %1, metadata !11, metadata !DIExpression()), !dbg !13
store <128 x i8> %1, <128 x i8>* @global_vec, !dbg !14
ret void, !dbg !15
Expand Down
85 changes: 85 additions & 0 deletions IGC/VectorCompiler/test/Legalization/live-elements.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
;=========================== begin_copyright_notice ============================
;
; Copyright (C) 2023 Intel Corporation
;
; SPDX-License-Identifier: MIT
;
;============================ end_copyright_notice =============================

; RUN: opt %use_old_pass_manager% -GenXLegalization -march=genx64 -mcpu=Gen9 -mtriple=spir64-unknown-unknown -S < %s | FileCheck %s

declare <16 x i32> @llvm.genx.oword.ld.v16i32(i32, i32, i32)

declare <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32>, i32, i32, i32, i16, i32)

declare void @llvm.genx.oword.st.v8i32(i32, i32, <8 x i32>)

; CHECK-LABEL: @test1
; CHECK: [[LOAD:%[^ ]+]] = tail call <16 x i32> @llvm.genx.oword.ld.v16i32(i32 0, i32 1, i32 0)
; CHECK-NEXT: [[LOAD_SPLIT0:%[^ ]+]] = call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[LOAD]], i32 8, i32 8, i32 1, i16 0, i32 undef)
; CHECK-NEXT: [[ADD_SPLIT0:%[^ ]+]] = add <8 x i32> [[LOAD_SPLIT0]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[ADD_SPLIT0_JOIN0:%[^ ]+]] = call <16 x i32> @llvm.genx.wrregioni.v16i32.v8i32.i16.i1(<16 x i32> undef, <8 x i32> [[ADD_SPLIT0]], i32 0, i32 8, i32 1, i16 0, i32 undef, i1 true)
; CHECK-NEXT: [[LOAD_SPLIT8:%[^ ]+]] = call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[LOAD]], i32 8, i32 8, i32 1, i16 32, i32 undef)
; CHECK-NEXT: [[ADD_SPLIT8:%[^ ]+]] = add <8 x i32> [[LOAD_SPLIT8]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[ADD_SPLIT8_JOIN8:%[^ ]+]] = call <16 x i32> @llvm.genx.wrregioni.v16i32.v8i32.i16.i1(<16 x i32> [[ADD_SPLIT0_JOIN0]], <8 x i32> [[ADD_SPLIT8]], i32 0, i32 8, i32 1, i16 32, i32 undef, i1 true)
; CHECK-NEXT: [[RDREGION:%[^ ]+]] = tail call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[ADD_SPLIT8_JOIN8]], i32 0, i32 8, i32 1, i16 0, i32 undef)
; CHECK-NEXT: tail call void @llvm.genx.oword.st.v8i32(i32 2, i32 0, <8 x i32> [[RDREGION]])
; test1: the only user of the 16-wide %add is a rdregion that reads 8 elements
; with stride 1 starting at offset 0. The expected output above has the add
; split into two 8-wide parts (offsets 0 and 32), so the part that is never
; read ends up in its own instruction.
define void @test1() {
%load = tail call <16 x i32> @llvm.genx.oword.ld.v16i32(i32 0, i32 1, i32 0)
%add = add <16 x i32> %load, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; Reads 8 x i32 from %add: width 8, stride 1, offset operand i16 0.
%rdregion = tail call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> %add, i32 0, i32 8, i32 1, i16 0, i32 undef)
tail call void @llvm.genx.oword.st.v8i32(i32 2, i32 0, <8 x i32> %rdregion)
ret void
}

; CHECK-LABEL: @test2
; CHECK: [[LOAD:%[^ ]+]] = tail call <16 x i32> @llvm.genx.oword.ld.v16i32(i32 0, i32 1, i32 0)
; CHECK-NEXT: [[LOAD_SPLIT0:%[^ ]+]] = call <4 x i32> @llvm.genx.rdregioni.v4i32.v16i32.i16(<16 x i32> [[LOAD]], i32 4, i32 4, i32 1, i16 0, i32 undef)
; CHECK-NEXT: [[ADD_SPLIT0:%[^ ]+]] = add <4 x i32> [[LOAD_SPLIT0]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[ADD_SPLIT0_JOIN0:%[^ ]+]] = call <16 x i32> @llvm.genx.wrregioni.v16i32.v4i32.i16.i1(<16 x i32> undef, <4 x i32> [[ADD_SPLIT0]], i32 0, i32 4, i32 1, i16 0, i32 undef, i1 true)
; CHECK-NEXT: [[LOAD_SPLIT4:%[^ ]+]] = call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[LOAD]], i32 8, i32 8, i32 1, i16 16, i32 undef)
; CHECK-NEXT: [[ADD_SPLIT4:%[^ ]+]] = add <8 x i32> [[LOAD_SPLIT4]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[ADD_SPLIT4_JOIN4:%[^ ]+]] = call <16 x i32> @llvm.genx.wrregioni.v16i32.v8i32.i16.i1(<16 x i32> [[ADD_SPLIT0_JOIN0]], <8 x i32> [[ADD_SPLIT4]], i32 0, i32 8, i32 1, i16 16, i32 undef, i1 true)
; CHECK-NEXT: [[LOAD_SPLIT12:%[^ ]+]] = call <4 x i32> @llvm.genx.rdregioni.v4i32.v16i32.i16(<16 x i32> [[LOAD]], i32 4, i32 4, i32 1, i16 48, i32 undef)
; CHECK-NEXT: [[ADD_SPLIT12:%[^ ]+]] = add <4 x i32> [[LOAD_SPLIT12]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[ADD_SPLIT12_JOIN12:%[^ ]+]] = call <16 x i32> @llvm.genx.wrregioni.v16i32.v4i32.i16.i1(<16 x i32> [[ADD_SPLIT4_JOIN4]], <4 x i32> [[ADD_SPLIT12]], i32 0, i32 4, i32 1, i16 48, i32 undef, i1 true)
; CHECK-NEXT: [[RDREGION:%[^ ]+]] = tail call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[ADD_SPLIT12_JOIN12]], i32 0, i32 8, i32 1, i16 16, i32 undef)
; CHECK-NEXT: tail call void @llvm.genx.oword.st.v8i32(i32 2, i32 0, <8 x i32> [[RDREGION]])
; test2: same shape as test1, but the rdregion starts at offset 16, i.e. in the
; middle of %add. The expected output above has the add split into three parts
; of 4, 8 and 4 elements (offsets 0, 16 and 48), so the read range gets its own
; 8-wide instruction and the unread head and tail are separated from it.
define void @test2() {
%load = tail call <16 x i32> @llvm.genx.oword.ld.v16i32(i32 0, i32 1, i32 0)
%add = add <16 x i32> %load, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; Reads 8 x i32 from %add: width 8, stride 1, offset operand i16 16.
%rdregion = tail call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> %add, i32 0, i32 8, i32 1, i16 16, i32 undef)
tail call void @llvm.genx.oword.st.v8i32(i32 2, i32 0, <8 x i32> %rdregion)
ret void
}

; CHECK-LABEL: @test3
; CHECK: [[LOAD:%[^ ]+]] = tail call <16 x i32> @llvm.genx.oword.ld.v16i32(i32 0, i32 1, i32 0)
; CHECK-NEXT: [[LOAD_SPLIT0:%[^ ]+]] = call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[LOAD]], i32 8, i32 8, i32 1, i16 0, i32 undef)
; CHECK-NEXT: [[ADD_SPLIT0:%[^ ]+]] = add <8 x i32> [[LOAD_SPLIT0]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[ADD_SPLIT0_JOIN0:%[^ ]+]] = call <16 x i32> @llvm.genx.wrregioni.v16i32.v8i32.i16.i1(<16 x i32> undef, <8 x i32> [[ADD_SPLIT0]], i32 0, i32 8, i32 1, i16 0, i32 undef, i1 true)
; CHECK-NEXT: [[LOAD_SPLIT8:%[^ ]+]] = call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[LOAD]], i32 8, i32 8, i32 1, i16 32, i32 undef)
; CHECK-NEXT: [[ADD_SPLIT8:%[^ ]+]] = add <8 x i32> [[LOAD_SPLIT8]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[ADD_SPLIT8_JOIN8:%[^ ]+]] = call <16 x i32> @llvm.genx.wrregioni.v16i32.v8i32.i16.i1(<16 x i32> [[ADD_SPLIT0_JOIN0]], <8 x i32> [[ADD_SPLIT8]], i32 0, i32 8, i32 1, i16 32, i32 undef, i1 true)
; CHECK-NEXT: [[RDREGION:%[^ ]+]] = tail call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[ADD_SPLIT8_JOIN8]], i32 0, i32 8, i32 1, i16 32, i32 undef)
; CHECK-NEXT: tail call void @llvm.genx.oword.st.v8i32(i32 2, i32 0, <8 x i32> [[RDREGION]])
; test3: mirror image of test1 - the rdregion starts at offset 32 and reads the
; last 8 elements of %add. The expected output above has the add split into
; two 8-wide parts (offsets 0 and 32), this time with the unread part first.
define void @test3() {
%load = tail call <16 x i32> @llvm.genx.oword.ld.v16i32(i32 0, i32 1, i32 0)
%add = add <16 x i32> %load, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; Reads 8 x i32 from %add: width 8, stride 1, offset operand i16 32.
%rdregion = tail call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> %add, i32 0, i32 8, i32 1, i16 32, i32 undef)
tail call void @llvm.genx.oword.st.v8i32(i32 2, i32 0, <8 x i32> %rdregion)
ret void
}

; CHECK-LABEL: @test4
; CHECK: [[LOAD:%[^ ]+]] = tail call <16 x i32> @llvm.genx.oword.ld.v16i32(i32 0, i32 1, i32 0)
; CHECK-NEXT: [[ADD:%[^ ]+]] = add <16 x i32> [[LOAD]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[RDREGION:%[^ ]+]] = tail call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> [[ADD]], i32 0, i32 8, i32 2, i16 0, i32 undef)
; CHECK-NEXT: tail call void @llvm.genx.oword.st.v8i32(i32 2, i32 0, <8 x i32> [[RDREGION]])
; test4: negative case. The rdregion reads 8 elements with stride 2, so the
; elements it uses are interleaved with unused ones rather than forming one
; contiguous range. The expected output above keeps the add as a single
; 16-wide instruction, i.e. no split is performed.
define void @test4() {
%load = tail call <16 x i32> @llvm.genx.oword.ld.v16i32(i32 0, i32 1, i32 0)
%add = add <16 x i32> %load, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; Reads 8 x i32 from %add: width 8, stride 2, offset operand i16 0.
%rdregion = tail call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.i16(<16 x i32> %add, i32 0, i32 8, i32 2, i16 0, i32 undef)
tail call void @llvm.genx.oword.st.v8i32(i32 2, i32 0, <8 x i32> %rdregion)
ret void
}

0 comments on commit e41de38

Please sign in to comment.