Skip to content

Commit

Permalink
MNN:Bugfix: Fix bug for raster blit with multi-thread MNN:Bugfix: Fix…
Browse files Browse the repository at this point in the history
… bug for opencl not reset and regionfuse
  • Loading branch information
xiaying committed Aug 27, 2024
1 parent 5e93be1 commit 9f38e39
Show file tree
Hide file tree
Showing 8 changed files with 219 additions and 74 deletions.
100 changes: 53 additions & 47 deletions source/backend/cpu/CPURaster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,32 @@

using Vec4 = MNN::Math::Vec<float, 4>;
namespace MNN {
/**
 * Classifies the three axes of a raster region into "reduce" and "normal" axes.
 *
 * Axis i is a reduce axis when slice.size[i] > 1 and slice.dst.stride[i] == 0,
 * i.e. the destination position does not advance along an axis that has more
 * than one element. Every other axis is a normal axis.
 *
 * The same instance may be reused for several regions: compute() rewrites
 * reduceMask and both counters on every call.
 */
struct ReduceInfo {
    // reduceMask[i] is 1 when axis i is a reduce axis, 0 otherwise.
    int reduceMask[3] = {0, 0, 0};
    // Number of valid leading entries in reduceIndex.
    int reduceNum = 0;
    // Axis ids of the reduce axes, in ascending order; only the first
    // reduceNum entries are meaningful.
    int reduceIndex[3] = {0, 0, 0};
    // Axis ids of the normal axes, in ascending order; only the first
    // normalNum entries are meaningful.
    int normalIndex[3] = {0, 0, 0};
    // Number of valid leading entries in normalIndex.
    int normalNum = 0;

    /**
     * Analyses `slice` and fills the members above.
     *
     * @param slice region whose size[] and dst.stride[] are inspected.
     * @return true when at least one reduce axis was found, false otherwise.
     */
    bool compute(const Tensor::InsideDescribe::Region& slice) {
        normalNum = 0;
        reduceNum = 0;
        for (int i = 0; i < 3; ++i) {
            const bool isReduce = slice.size[i] > 1 && slice.dst.stride[i] == 0;
            // Always overwrite the mask so a previous call cannot leave stale bits.
            reduceMask[i] = isReduce ? 1 : 0;
            if (isReduce) {
                reduceIndex[reduceNum] = i;
                reduceNum++;
            } else {
                MNN_ASSERT(normalNum < 3);
                normalIndex[normalNum] = i;
                normalNum++;
            }
        }
        return reduceNum > 0;
    }
};

ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std::vector<Tensor *> &outputs) {
MNN_ASSERT(outputs.size() == 1);
Expand Down Expand Up @@ -138,7 +164,6 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
}
}
auto cache = static_cast<CPUBackend*>(backend())->getCache();
#if 1
auto tempTensor = cache->findCacheTensor(origin, midFormat);
//MNN_ASSERT(CPUBackend::getBytes(backend(), origin) == 4);
if (nullptr == tempTensor) {
Expand All @@ -160,22 +185,6 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
if (--TensorUtils::getDescribe(tempTensor)->useCount == 0) {
forRelease.emplace_back(tempTensor);
}
#else
std::shared_ptr<Tensor> newTensor(new Tensor);
TensorUtils::copyShape(origin, newTensor.get());
TensorUtils::getDescribe(newTensor.get())->dimensionFormat = midFormat;
TensorUtils::getDescribe(newTensor.get())->quantAttr = TensorUtils::getDescribe(origin)->quantAttr;
newTensor->buffer().type = origin->getType();
TensorUtils::setLinearLayout(newTensor.get());
mTempInput.insert(std::make_pair(origin, newTensor.get()));
auto res = backend()->onAcquireBuffer(newTensor.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
auto tempTensor = newTensor.get();
backend()->onReleaseBuffer(tempTensor, Backend::DYNAMIC);
cache->pushCacheTensor(newTensor, origin, midFormat);
#endif
mTempInputCopy.emplace_back(std::make_pair(tempTensor, &slice));
}
for (auto t : forRelease) {
Expand All @@ -185,7 +194,15 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
}
auto threadNumber = static_cast<CPUBackend*>(backend())->threadNumber();
if (mTempInputCopy.size() == 1 && threadNumber > 1) {
mHasReduce = false;
ReduceInfo reduceInfo;
for (auto& iter : mTempInputCopy) {
if (reduceInfo.compute(*iter.second)) {
mHasReduce = true;
break;
}
}
if (mTempInputCopy.size() == 1 && threadNumber > 1 && (!mHasReduce)) {
// Split to multi region
auto region = mTempInputCopy[0].second;
const int thredHold = 100;//TODO: Find better way to determine it
Expand Down Expand Up @@ -396,25 +413,11 @@ static void _zero(const Tensor::InsideDescribe::Region& slice, int bytes, uint8_
}
}
static bool _reduceblit(const Tensor::InsideDescribe::Region& slice, int bytes, const uint8_t* srcPtr, uint8_t* dstPtr) {
int reduceMask[3] = {0, 0, 0};
int reduceNum = 0;
int reduceIndex[3];
int normalIndex[3];
int normalNum = 0;
for (int i=0; i<3; ++i) {
if (slice.size[i] > 1 && slice.dst.stride[i] == 0) {
reduceMask[i] = 1;
reduceIndex[reduceNum] = i;
reduceNum ++;
} else {
normalIndex[normalNum] = i;
normalNum++;
}
}
if (0 == reduceNum) {
return false;
}
switch (reduceNum) {
ReduceInfo reduceInfo;
reduceInfo.compute(slice);
auto normalIndex = reduceInfo.normalIndex;
auto reduceIndex = reduceInfo.reduceIndex;
switch (reduceInfo.reduceNum) {
case 3:
{
float summer = 0.0f;
Expand Down Expand Up @@ -490,14 +493,13 @@ static bool _reduceblit(const Tensor::InsideDescribe::Region& slice, int bytes,
return false;
}

static void _blit(const Tensor::InsideDescribe::Region& slice, int bytes, const uint8_t* srcPtr, uint8_t* dstPtr) {
static void _blit(const Tensor::InsideDescribe::Region& slice, int bytes, const uint8_t* srcPtr, uint8_t* dstPtr, bool hasReduce) {
auto proc = _selectUnitProc(bytes, slice.src.stride[2], slice.dst.stride[2]);
#define MNN_BLIT_SUPPORT_REDUCE
#ifdef MNN_BLIT_SUPPORT_REDUCE
if (_reduceblit(slice, bytes, srcPtr, dstPtr)) {
return;
if (hasReduce) {
if (_reduceblit(slice, bytes, srcPtr, dstPtr)) {
return;
}
}
#endif
if (slice.src.stride[1] == slice.size[2] && slice.dst.stride[1] == slice.size[2] && slice.src.stride[2] == 1) {
for (int z=0; z<slice.size[0]; ++z) {
auto srcZ = srcPtr + z * slice.src.stride[0] * bytes;
Expand Down Expand Up @@ -624,13 +626,17 @@ ErrorCode CPURaster::onExecute(const std::vector<Tensor *> &____inputs, const st
tensorConvert(iter.first, iter.second, bytes);
}
threadNum = ALIMIN(threadNum, (int)mTempInputCopy.size());
if (mHasReduce) {
// Don't support reduce with multi thread now
threadNum = 1;
}
MNN_CONCURRENCY_BEGIN(tId, threadNum) {
for (int u=tId; u<mTempInputCopy.size(); u+=threadNum) {
auto& iter = mTempInputCopy[u];
auto& slice = *(iter.second);
auto srcPtr = iter.first->host<uint8_t>() + slice.src.offset * bytes;
auto dstPtr = (uint8_t*)mOutputPtr + slice.dst.offset * bytes;
_blit(slice, bytes, srcPtr, dstPtr);
_blit(slice, bytes, srcPtr, dstPtr, mHasReduce);
}
}
MNN_CONCURRENCY_END();
Expand Down Expand Up @@ -807,7 +813,7 @@ class CPULoop : public Execution {
if (halide_type_float == input->getType().code) {
bytes = cpubackend->functions()->bytes;
}
_blit(reg, bytes, input->host<uint8_t>(), output->host<uint8_t>());
_blit(reg, bytes, input->host<uint8_t>(), output->host<uint8_t>(), false);
}

}
Expand Down Expand Up @@ -855,7 +861,7 @@ class CPULoop : public Execution {
auto dstOffset = dstIter * step0 + dstView->offset();
if (dstOffset >= 0) {
if (srcOffset >= 0 && srcOffset < inputSize) {
_blit(reg, bytes, input->host<uint8_t>() + bytes * srcOffset, output->host<uint8_t>() + bytes * dstOffset);
_blit(reg, bytes, input->host<uint8_t>() + bytes * srcOffset, output->host<uint8_t>() + bytes * dstOffset, false);
} else {
_zero(reg, bytes, output->host<uint8_t>() + bytes * dstOffset);
}
Expand Down Expand Up @@ -921,7 +927,7 @@ class CPULoop : public Execution {
auto step0 = cmd->steps()->data()[0];
auto step1 = cmd->steps()->data()[1];
auto loopNumber = mLoop->loopNumber();
_blit(reg, bytes, (const uint8_t*)src, (uint8_t*)dst);
_blit(reg, bytes, (const uint8_t*)src, (uint8_t*)dst, false);
break;
}
auto proc = static_cast<CPUBackend*>(backend())->functions()->MNNSelectUnaryFunctionForFloat(op->main_as_UnaryOp()->opType(), static_cast<CPUBackend*>(backend())->precisionMode());
Expand Down
1 change: 1 addition & 0 deletions source/backend/cpu/CPURaster.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ class CPURaster : public Execution {
OpCommonUtils::TensorConvertParameter mSingleConvert;
std::vector<std::shared_ptr<Tensor::InsideDescribe::Region>> mCacheRegions;
int32_t mZeroPoint = 0;
bool mHasReduce = false;
};
}
#endif
3 changes: 3 additions & 0 deletions source/backend/opencl/core/OpenCLBackend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,9 @@ void CLRuntime::onMaskOpReady(const std::vector<Tensor*>& inputs, const std::vec
dstInfo->mInfos.emplace_back(std::move(opInfo));
}
}
// Reset hook of the OpenCL runtime: forwards numberThread to
// mOpenCLRuntime->setGpuMode(). The config and full arguments are not used
// by this implementation.
void CLRuntime::onReset(int numberThread, const BackendConfig* config, bool full) {
// NOTE(review): setGpuMode's parameter is named cl_mode_num, so numberThread
// presumably carries the GPU mode flags here rather than a thread count - confirm.
mOpenCLRuntime->setGpuMode(numberThread);
}

bool CLRuntime::onSetCache(const void* buffer, size_t size) {
if (nullptr == buffer) {
Expand Down
1 change: 1 addition & 0 deletions source/backend/opencl/core/OpenCLBackend.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ class CLRuntime : public Runtime {
virtual ~CLRuntime();

virtual Backend* onCreate(const BackendConfig* config) const override;
virtual void onReset(int numberThread, const BackendConfig* config, bool full) override;
virtual void onGabageCollect(int level) override;
virtual float onGetMemoryInMB() override;
virtual std::pair<const void*, size_t> onGetCache() override;
Expand Down
2 changes: 1 addition & 1 deletion source/backend/opencl/core/runtime/OpenCLRuntime.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,11 +174,11 @@ class OpenCLRuntime {

std::pair<const void*, size_t> makeCache(void* tuneInfo);
bool setCache(std::pair<const void*, size_t> cache);
void setGpuMode(const int cl_mode_num);
private:
bool loadProgram(const std::string &programName, cl::Program *program);
bool buildProgram(const std::string &buildOptionsStr, cl::Program *program);
bool getDeviceSupportsExtension(const cl::Device &device, const char *extensionName);
void setGpuMode(const int cl_mode_num);

private:
std::vector<size_t> mMaxImageSize;
Expand Down
43 changes: 18 additions & 25 deletions source/core/TensorUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -544,15 +544,30 @@ static bool _ClipDst(int* stride, int srcOffset, int dstOffset, const int* srcSi
dstMin[i] = ALIMAX(0, -o[i]);
dstMax[i] = ALIMIN(srcSize[i]-o[i], dstSize[i]);
}
int srcMin = -1;
for (int i=0; i<sizeNum; ++i) {
if (dstMax[i] < srcSize[i]) {
if (srcMin == -1) {
srcMin = stride[i];
} else {
srcMin = ALIMIN(stride[i], srcMin);
}
}
}
if (srcMin < 0) {
// Src is fully used
return true;
}

// Check If dstMax is inside src, it means one region can't describe dst - src
// TODO: Support slice region to support fuse
for (int i=0; i<sizeNum; ++i) {
if (dstMax[i] == dstSize[i]) {
continue;
}
int bias = offsetBias + dstMax[i] * stride[i];
if (bias < srcMax) {
// for [dstMax, dstSize], exist value match formula
if (bias < srcMax && bias >= srcMin) {
// for [dstMax, dstSize], may exist value match formula
return false;
}
}
Expand All @@ -578,7 +593,6 @@ class TensorUtils::FuseRegionStatus {
public:
enum Status {
FUSE_SRC_COPY,
FUSE_DST_COPY,
FUSE_REGION_COMPUTE
};
void apply(const Tensor::InsideDescribe::Region& srcReg, Tensor::InsideDescribe::Region& dstReg) {
Expand All @@ -587,16 +601,6 @@ class TensorUtils::FuseRegionStatus {
dstReg.origin = srcReg.origin;
dstReg.src.offset += srcReg.src.offset - srcReg.dst.offset;
break;
case FUSE_DST_COPY:
dstReg.origin = srcReg.origin;
dstReg.dst = srcReg.dst;
dstReg.src = srcReg.src;
dstReg.src.offset = mSrcOff;
dstReg.dst.offset = mDstOff;
dstReg.size[0] = srcReg.size[0];
dstReg.size[1] = srcReg.size[1];
dstReg.size[2] = srcReg.size[2];
break;
case FUSE_REGION_COMPUTE:
{
if (dstSize[0] == 0) {
Expand All @@ -612,7 +616,7 @@ class TensorUtils::FuseRegionStatus {
int valid[3] = {0, 0, 0};
int offset = 3 - dstNum;
if (dstNum > sizeNum) {
for (int i = 2; i >= 0; i--) {
for (int i = dstNum - 1; i >= 0; i--) {
if (i < dstNum) {
if (dstSize[i] == 1) {
expandIdx = i;
Expand Down Expand Up @@ -691,17 +695,6 @@ class TensorUtils::FuseRegionStatus {
mStatus = FUSE_SRC_COPY;
return true;
}
// dst copy fuse
if (isCopyRegion(dstReg) && dstTotalSize == srcTotalSize && copyValid) {
mSrcOff = dstReg.src.offset - srcReg.dst.offset;
mDstOff = dstReg.dst.offset;
mSrcOff = offsetCompute(srcReg, dstReg.src.offset, srcReg.dst.offset, true) + srcReg.src.offset;
if (!(srcReg.src.stride[2] > 0 && mSrcOff % srcReg.src.stride[2] != 0)) {
// when transpose + slice, offset is not align can't fuse
mStatus = FUSE_DST_COPY;
return true;
}
}
#define MNN_3_INT_INIT(x, y) { x[0] = y; x[1] = y; x[2] = y; }
MNN_3_INT_INIT(dstStride, -1)
MNN_3_INT_INIT(srcStride, -1)
Expand Down
6 changes: 5 additions & 1 deletion test/core/RegionFuse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class RegionFuseTest : public MNNTestCase {
using Region = Tensor::InsideDescribe::Region;
virtual ~RegionFuseTest() = default;
virtual bool run(int precision) {
constexpr int N = 11;
constexpr int N = 12;
// [src_offset, src_stride_0_1_2, dst_offset, dst_stride_0_1_2, size_0_1_2]
int data[N*3][11] = {
// 2D-transpose + 2D-transpose = memcpy: [1, 4, 16] => [1, 16, 4] => [1, 4, 16]
Expand Down Expand Up @@ -64,6 +64,10 @@ class RegionFuseTest : public MNNTestCase {
{0, 12321, 111, 1, 0, 12544, 112, 1, 32, 111, 111},
{113, 12544, 112, 1, 0, 12321, 111, 1, 32, 111, 111},
{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
// concat + stack
{0, 32, 1, 1, 32, 64, 1, 1, 20, 32, 1},
{0, 0, 1280, 1, 0, 1, 640, 1, 1, 1, 640},
{0, 0, 32, 1, 32, 0, 64, 1, 1, 10, 32},
};
TensorUtils::FuseWrap fuseUtils;
for (int i = 0; i < N; i++) {
Expand Down
Loading

0 comments on commit 9f38e39

Please sign in to comment.