From 5cb62aea92218a82bcad3d787af4e6d4e96ca4a1 Mon Sep 17 00:00:00 2001 From: "Wesierski, Lukasz" Date: Mon, 31 Jul 2023 16:56:18 +0000 Subject: [PATCH] Add support for emitSimdMediaBlock[Read|Write] for bigger types Add support for emitSimdMediaBlock[Read|Write] for wider vectors like ushort16 --- IGC/Compiler/CISACodeGen/EmitVISAPass.cpp | 310 ++++++++++++++++------ 1 file changed, 222 insertions(+), 88 deletions(-) diff --git a/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp b/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp index 6420858153e9..9890f84ae08d 100644 --- a/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp +++ b/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp @@ -6635,7 +6635,7 @@ void EmitPass::emitSimdMediaBlockRead(llvm::Instruction* inst) { nbElements = (uint32_t)cast(inst->getType())->getNumElements(); } - IGC_ASSERT_MESSAGE(nbElements <= 8, "InValid Vector Size"); + IGC_ASSERT_MESSAGE(nbElements <= 16, "InValid Vector Size"); int SrcImgBTI = int_cast(GetImmediateVal(inst->getOperand(0))); int isImageTypeUAV = int_cast(GetImmediateVal(inst->getOperand(3))); @@ -6646,13 +6646,18 @@ void EmitPass::emitSimdMediaBlockRead(llvm::Instruction* inst) uint32_t typeSizeInBytes = inst->getType()->getScalarType()->getScalarSizeInBits() / 8; uint32_t totalWidth = typeSizeInBytes * numLanes(m_SimdMode); - uint32_t pass = 0; - uint32_t numPasses = 0; + uint32_t pass_axisX = 0; + uint32_t numPasses_axisX = 0; + + uint32_t pass_axisY = 0; + uint32_t numPasses_axisY = 0; + uint32_t bindingTableIndex = 0; uint32_t dstSubReg = 0; uint32_t blockWidth = 0; uint32_t blockHeight = nbElements; + uint32_t blockHeight_step = 0; if (isImageTypeUAV) { @@ -6670,14 +6675,14 @@ void EmitPass::emitSimdMediaBlockRead(llvm::Instruction* inst) if (totalWidth < maxWidth) { - numPasses = 1; + numPasses_axisX = 1; blockWidth = totalWidth; } else { IGC_ASSERT(maxWidth); IGC_ASSERT_MESSAGE(totalWidth % maxWidth == 0, "Total width must be divisible by 32!"); - numPasses = totalWidth / maxWidth; + numPasses_axisX 
= totalWidth / maxWidth; blockWidth = maxWidth; } @@ -6687,19 +6692,19 @@ void EmitPass::emitSimdMediaBlockRead(llvm::Instruction* inst) if (totalWidth > 32 && blockHeight <= 4 && (totalWidth % maxWidth == 0)) { // do 64 byte wide read - numPasses = totalWidth / maxWidth; + numPasses_axisX = totalWidth / maxWidth; blockWidth = maxWidth; } } - CVariable* pTempVar0 = nullptr; - CVariable* pTempVar = nullptr; + CVariable* pTempVar_axis_X_offset = nullptr; + CVariable* pTempVar_axis_Y_offset = nullptr; uint32_t blockRegSize = 0; //Following variable declaration is SIMD8 based, UD is used, so blockRegSize is total required registers. auto simdMode = lanesToSIMDMode(blockWidth / typeSizeInBytes); - blockRegSize = numPasses * blockHeight * numLanes(simdMode); + blockRegSize = numPasses_axisX * blockHeight * numLanes(simdMode); CVariable* pTempDest = m_currShader->GetNewVariable( blockRegSize, @@ -6710,6 +6715,28 @@ void EmitPass::emitSimdMediaBlockRead(llvm::Instruction* inst) CVariable* xVar = GetSymbol(xOffset); CVariable* yVar = GetSymbol(yOffset); + int scale = blockWidth / getGRFSize(); + + const int MediaBlockReadByteMax = 256; + int totalBytesToLoadOneSend = (blockRegSize * typeSizeInBytes) / numPasses_axisX; + + if (totalBytesToLoadOneSend > MediaBlockReadByteMax) + { + numPasses_axisY = totalBytesToLoadOneSend / MediaBlockReadByteMax; + // Slice the block horizontally by number of axisY passes + blockHeight_step = blockHeight / numPasses_axisY; + } + else + { + numPasses_axisY = 1; + } + + int blockHeight2Load = numPasses_axisY > 1 ? + // For case when one send cannot handle the whole load + blockHeight_step : + // For normal case + blockHeight; + // Emits a MEDIA_BLOCK_READ instruction. 
// Considering block width as x-axis and block height as y axis: // Pass 0 reads from (xOffset,yOffset) to (xOffset+31, yOffset+blockheight) @@ -6734,68 +6761,102 @@ void EmitPass::emitSimdMediaBlockRead(llvm::Instruction* inst) // | | // --------- // 32 or 64 bytes at most, that's the reason simd8 is used. - - int scale = blockWidth / getGRFSize(); - - for (pass = 0; pass < numPasses; pass++) + for (pass_axisX = 0; pass_axisX < numPasses_axisX; pass_axisX++) { m_encoder->SetSimdSize(SIMDMode::SIMD1); m_encoder->SetNoMask(); m_encoder->SetSrcRegion(0, 0, 1, 0); - if (pass == 0) + if (pass_axisX == 0) { - pTempVar0 = m_currShader->GetNewVariable( + pTempVar_axis_X_offset = m_currShader->GetNewVariable( numLanes(m_SimdMode), ISA_TYPE_UD, EALIGN_DWORD, CName::NONE); - m_encoder->Copy(pTempVar0, xVar); + m_encoder->Copy(pTempVar_axis_X_offset, xVar); } else { - m_encoder->Add(pTempVar0, pTempVar0, m_currShader->ImmToVariable(blockWidth, ISA_TYPE_UD)); - uint32_t subOffset = blockWidth * blockHeight; + m_encoder->Add(pTempVar_axis_X_offset, pTempVar_axis_X_offset, m_currShader->ImmToVariable(blockWidth, ISA_TYPE_UD)); + uint32_t subOffset = blockWidth * blockHeight2Load; subOffset /= getGRFSize(); dstSubReg = dstSubReg + subOffset; } m_encoder->Push(); - m_encoder->SetSimdSize(SIMDMode::SIMD1); - m_encoder->SetNoMask(); - m_encoder->SetSrcRegion(0, 0, 1, 0); + // Handles the case where a single send instruction would have to load more than + // MediaBlockReadByteMax=256 bytes. In that scenario the data is sliced horizontally, for example: + // + // media_ld : grf_0 [.........] + // grf_1 [.........] + // ... + // grf_x [.........] + // ... + // grf_15 [.........] + // In this example 512 bytes are requested but the limit is 256 bytes, so the + // input data is split across two sends: + // + // media_ld_0 : grf_0 [.........] + // grf_1 [.........] + // ... + // grf_x [.........] + // ... + // grf_7 [.........] + // media_ld_1 : grf_8 [.........] + // grf_9 [.........] + // ...
+ // grf_x [.........] + // ... + // grf_15 [.........] + for (pass_axisY = 0; pass_axisY < numPasses_axisY; ++pass_axisY) + { + m_encoder->SetSimdSize(SIMDMode::SIMD1); + m_encoder->SetNoMask(); + m_encoder->SetSrcRegion(0, 0, 1, 0); - pTempVar = m_currShader->GetNewVariable( - numLanes(m_SimdMode), - ISA_TYPE_UD, - EALIGN_DWORD, - CName::NONE); + if (pass_axisY == 0) + { + pTempVar_axis_Y_offset = m_currShader->GetNewVariable( + numLanes(m_SimdMode), + ISA_TYPE_UD, + EALIGN_DWORD, + CName::NONE); - m_encoder->Copy(pTempVar, yVar); - m_encoder->Push(); + m_encoder->Copy(pTempVar_axis_Y_offset, yVar); + } + else + { + m_encoder->Add(pTempVar_axis_Y_offset, pTempVar_axis_Y_offset, m_currShader->ImmToVariable(blockHeight_step, ISA_TYPE_UD)); + uint32_t subOffset = (MediaBlockReadByteMax * pass_axisY) / numPasses_axisX; + subOffset /= getGRFSize(); + dstSubReg = dstSubReg + subOffset; + } + m_encoder->Push(); - m_encoder->SetDstSubVar(dstSubReg); + m_encoder->SetDstSubVar(dstSubReg); - CVariable* dstVar = numPasses == 1 ? m_destination : pTempDest; + CVariable* dstVar = numPasses_axisX == 1 ? 
m_destination : pTempDest; - { - m_encoder->MediaBlockMessage( - ISA_Opcode::ISA_MEDIA_LD, - dstVar, - ESURFACE_NORMAL, - srcbti, - pTempVar0, - pTempVar, - 0, - (unsigned char)blockWidth, - (unsigned char)blockHeight, - 0); + { + m_encoder->MediaBlockMessage( + ISA_Opcode::ISA_MEDIA_LD, + dstVar, + ESURFACE_NORMAL, + srcbti, + pTempVar_axis_X_offset, + pTempVar_axis_Y_offset, + 0, + (unsigned char)blockWidth, + (unsigned char)blockHeight2Load, + 0); + } + m_encoder->Push(); } - m_encoder->Push(); } - if (numPasses > 1) + if (numPasses_axisX > 1) { dstSubReg = 0; @@ -6848,7 +6909,7 @@ void EmitPass::emitSimdMediaBlockRead(llvm::Instruction* inst) uint32_t dstSubRegOffset = 0; uint32_t srcSubRegOffset = 0; - for (uint32_t pass = 0; pass < numPasses; pass++) //Width + for (uint32_t pass = 0; pass < numPasses_axisX; pass++) //Width { m_encoder->SetSimdSize(simdMode); m_encoder->SetNoMask(); @@ -6889,7 +6950,7 @@ void EmitPass::emitSimdMediaBlockWrite(llvm::Instruction* inst) { nbElements = (uint32_t)cast(dataPtr->getType())->getNumElements(); } - IGC_ASSERT_MESSAGE(nbElements <= 8, "InValid Vector Size"); + IGC_ASSERT_MESSAGE(nbElements <= 16, "InValid Vector Size"); CVariable* data = GetSymbol(dataPtr); data = BroadcastIfUniform(data); @@ -6897,11 +6958,15 @@ void EmitPass::emitSimdMediaBlockWrite(llvm::Instruction* inst) uint32_t typeSizeInBytes = dataPtr->getType()->getScalarType()->getScalarSizeInBits() / 8; uint32_t totalWidth = typeSizeInBytes * numLanes(m_SimdMode); - uint32_t pass = 0; - uint32_t numPasses = 0; + uint32_t pass_axisX = 0; + uint32_t numPasses_axisX = 0; + + uint32_t pass_axisY = 0; + uint32_t numPasses_axisY = 0; uint32_t blockWidth = 0; uint32_t blockHeight = nbElements; + uint32_t blockHeight_step = 0; uint32_t bindingTableIndex = 0; if (isImageTypeUAV) @@ -6920,14 +6985,14 @@ void EmitPass::emitSimdMediaBlockWrite(llvm::Instruction* inst) if (totalWidth < maxWidth) { - numPasses = 1; + numPasses_axisX = 1; blockWidth = totalWidth; } else { 
IGC_ASSERT(maxWidth); IGC_ASSERT_MESSAGE(totalWidth % maxWidth == 0, "Total width must be divisible by 32!"); - numPasses = totalWidth / maxWidth; + numPasses_axisX = totalWidth / maxWidth; blockWidth = maxWidth; } @@ -6937,23 +7002,40 @@ void EmitPass::emitSimdMediaBlockWrite(llvm::Instruction* inst) if (totalWidth > 32 && blockHeight <= 4 && totalWidth % maxWidth == 0) { // do 64 byte wide read - numPasses = totalWidth / maxWidth; + numPasses_axisX = totalWidth / maxWidth; blockWidth = maxWidth; } } - CVariable* pTempVar0 = nullptr; - CVariable* pTempVar = nullptr; + CVariable* pTempVar_axis_X_offset = nullptr; + CVariable* pTempVar_axis_Y_offset = nullptr; uint32_t dstSubReg = 0; + uint32_t srcSubReg = 0; int scale = blockWidth / getGRFSize(); auto simdMode = lanesToSIMDMode(blockWidth / typeSizeInBytes); - for (pass = 0; pass < numPasses; pass++) + + const int MediaBlockStoreByteMax = 256; + int blockRegSize = numPasses_axisX * blockHeight * numLanes(simdMode); + int totalBytesToStore = (blockRegSize * typeSizeInBytes) / numPasses_axisX; + + if (totalBytesToStore > MediaBlockStoreByteMax) + { + numPasses_axisY = totalBytesToStore / MediaBlockStoreByteMax; + // Slice the block horizontally by number of axisY passes + blockHeight_step = blockHeight / numPasses_axisY; + } + else { - uint32_t srcSubVar = pass * blockWidth / getGRFSize(); + numPasses_axisY = 1; + } + + for (pass_axisX = 0; pass_axisX < numPasses_axisX; pass_axisX++) + { + uint32_t srcSubVar = pass_axisX * blockWidth / getGRFSize(); uint32_t dstSubVar = 0; - uint32_t srcSubRegOffset = (pass * blockWidth) % getGRFSize(); + uint32_t srcSubRegOffset = (pass_axisX * blockWidth) % getGRFSize(); uint32_t dstSubRegOffset = 0; CVariable* tempdst = nullptr; @@ -6992,7 +7074,7 @@ void EmitPass::emitSimdMediaBlockWrite(llvm::Instruction* inst) // mov (8) r31.8<1>:d r66.8<8;8,1>:d {Align1, Q1, Compacted} //... 
- if (numPasses > 1) + if (numPasses_axisX > 1) { for (uint i = 0; i < nbElements; ++i) { @@ -7011,7 +7093,7 @@ void EmitPass::emitSimdMediaBlockWrite(llvm::Instruction* inst) { dstSubVar += scale > 0 ? scale : 1; } - srcSubVar = srcSubVar + (numPasses * blockWidth / getGRFSize()); + srcSubVar = srcSubVar + (numPasses_axisX * blockWidth / getGRFSize()); m_encoder->Copy(tempdst, data); m_encoder->Push(); @@ -7032,33 +7114,20 @@ void EmitPass::emitSimdMediaBlockWrite(llvm::Instruction* inst) // mov (16) r29.0<1>:ud r22.0<8;8,1>:ud {Align1, NoMask, Compacted} // mov (16) r31.0<1>:ud r24.0<8;8,1>:ud {Align1, NoMask, Compacted} // send (8) null<1>:ud r28 0xc 0xa0a8002:ud{Align1, NoMask} // media block write - if (pass == 0) + if (pass_axisX == 0) { CVariable* xVar = GetSymbol(xOffset); - CVariable* yVar = GetSymbol(yOffset); - m_encoder->SetSimdSize(SIMDMode::SIMD1); - m_encoder->SetNoMask(); - m_encoder->SetSrcRegion(0, 0, 1, 0); - - pTempVar0 = m_currShader->GetNewVariable( - numLanes(m_SimdMode), - ISA_TYPE_D, - EALIGN_DWORD, - CName::NONE); - - m_encoder->Cast(pTempVar0, xVar); - m_encoder->Push(); m_encoder->SetSimdSize(SIMDMode::SIMD1); m_encoder->SetNoMask(); m_encoder->SetSrcRegion(0, 0, 1, 0); - pTempVar = m_currShader->GetNewVariable( + pTempVar_axis_X_offset = m_currShader->GetNewVariable( numLanes(m_SimdMode), ISA_TYPE_D, EALIGN_DWORD, CName::NONE); - m_encoder->Cast(pTempVar, yVar); + m_encoder->Cast(pTempVar_axis_X_offset, xVar); m_encoder->Push(); } else @@ -7066,26 +7135,91 @@ void EmitPass::emitSimdMediaBlockWrite(llvm::Instruction* inst) m_encoder->SetSimdSize(SIMDMode::SIMD1); m_encoder->SetNoMask(); m_encoder->SetSrcRegion(0, 0, 1, 0); - m_encoder->Add(pTempVar0, pTempVar0, m_currShader->ImmToVariable(blockWidth, ISA_TYPE_UD)); + m_encoder->Add(pTempVar_axis_X_offset, pTempVar_axis_X_offset, m_currShader->ImmToVariable(blockWidth, ISA_TYPE_UD)); m_encoder->Push(); dstSubReg = dstSubReg + scale * blockHeight; } + // In case when we wanted to store 
more than MediaBlockStoreByteMax=256 data in + // one send instruction. For such scenario we are slicing the data horizontally, example: + // + // media_st : grf_0 [.........] + // grf_1 [.........] + // ... + // grf_x [.........] + // ... + // grf_15 [.........] + // In such example we wanted to store 512 bytes - the limitation is 256 bytes, so slice + // input data for two sends: + // + // media_st_0 : grf_0 [.........] + // grf_1 [.........] + // ... + // grf_x [.........] + // ... + // grf_7 [.........] + // media_st_1 : grf_8 [.........] + // grf_9 [.........] + // ... + // grf_x [.........] + // ... + // grf_15 [.........] + for (pass_axisY = 0; pass_axisY < numPasses_axisY; ++pass_axisY) + { + m_encoder->SetSimdSize(SIMDMode::SIMD1); + m_encoder->SetNoMask(); + m_encoder->SetSrcRegion(0, 0, 1, 0); - m_encoder->SetDstSubVar(dstSubReg); + CVariable* yVar = GetSymbol(yOffset); + if (pass_axisY == 0) + { + pTempVar_axis_Y_offset = m_currShader->GetNewVariable( + numLanes(m_SimdMode), + ISA_TYPE_D, + EALIGN_DWORD, + CName::NONE); - { - m_encoder->MediaBlockMessage( - ISA_Opcode::ISA_MEDIA_ST, - tempdst, ESURFACE_NORMAL, - srcbti, - pTempVar0, - pTempVar, - 0, - (unsigned char)blockWidth, - (unsigned char)blockHeight, - 0); + m_encoder->Cast(pTempVar_axis_Y_offset, yVar); + m_encoder->Push(); + } + else + { + m_encoder->Add(pTempVar_axis_Y_offset, pTempVar_axis_Y_offset, m_currShader->ImmToVariable(blockHeight_step, ISA_TYPE_UD)); + m_encoder->Push(); + uint32_t subOffset = (MediaBlockStoreByteMax * pass_axisY) / numPasses_axisX; + srcSubReg = srcSubReg + subOffset; + + // Offset the source for next store instr + tempdst = + m_currShader->GetNewAlias( + tempdst, + tempdst->GetType(), + srcSubReg, + tempdst->GetNumberElement()); + } + + m_encoder->SetDstSubVar(dstSubReg); + + int blockHeight2Store = numPasses_axisY > 1 ? 
+ // For case when one send cannot handle the whole store + blockHeight_step : + // For normal case + blockHeight; + + { + m_encoder->MediaBlockMessage( + ISA_Opcode::ISA_MEDIA_ST, + tempdst, + ESURFACE_NORMAL, + srcbti, + pTempVar_axis_X_offset, + pTempVar_axis_Y_offset, + 0, + (unsigned char)blockWidth, + (unsigned char)blockHeight2Store, + 0); + } + m_encoder->Push(); } - m_encoder->Push(); } }