MNN:Bugfix: Fix bug for raster blit with multi-thread

MNN:Bugfix: Fix bug for opencl not reset and regionfuse
alibaba · Aug 27, 2024 · 9f38e39 · 9f38e39
1 parent 5e93be1
commit 9f38e39
Show file tree

Hide file tree

Showing 8 changed files with 219 additions and 74 deletions.
diff --git a/source/backend/cpu/CPURaster.cpp b/source/backend/cpu/CPURaster.cpp
@@ -20,6 +20,32 @@
 
 using Vec4 = MNN::Math::Vec<float, 4>;
 namespace MNN {
+// Splits the three axes of a region into "reduce" axes and "normal" axes.
+// An axis counts as a reduce axis when it holds more than one element
+// (size > 1) while its destination stride is 0, i.e. every element along
+// that axis is written to the same destination offset.
+struct ReduceInfo {
+    // reduceMask[i] is 1 when axis i is a reduce axis, 0 otherwise.
+    int reduceMask[3] = {0, 0, 0};
+    // Number of valid entries in reduceIndex.
+    int reduceNum = 0;
+    // Axis ids of the reduce axes in ascending order; only the first
+    // reduceNum entries are meaningful.
+    int reduceIndex[3] = {0, 0, 0};
+    // Axis ids of the remaining axes in ascending order; only the first
+    // normalNum entries are meaningful.
+    int normalIndex[3] = {0, 0, 0};
+    // Number of valid entries in normalIndex.
+    int normalNum = 0;
+
+    // Recomputes the classification for `slice`, discarding any state left
+    // by a previous call. Returns true when at least one reduce axis exists.
+    bool compute(const Tensor::InsideDescribe::Region& slice) {
+        normalNum = 0;
+        reduceNum = 0;
+        for (int i = 0; i < 3; ++i) {
+            // Clear the flag first: one instance may be reused for several
+            // slices, and a stale 1 from an earlier slice must not survive.
+            reduceMask[i] = 0;
+            if (slice.size[i] > 1 && slice.dst.stride[i] == 0) {
+                reduceMask[i] = 1;
+                reduceIndex[reduceNum] = i;
+                reduceNum++;
+            } else {
+                MNN_ASSERT(normalNum < 3);
+                normalIndex[normalNum] = i;
+                normalNum++;
+            }
+        }
+        return reduceNum > 0;
+    }
+};
 
 ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std::vector<Tensor *> &outputs) {
  MNN_ASSERT(outputs.size() == 1);
@@ -138,7 +164,6 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
  }
  }
  auto cache = static_cast<CPUBackend*>(backend())->getCache();
-#if 1
  auto tempTensor = cache->findCacheTensor(origin, midFormat);
  //MNN_ASSERT(CPUBackend::getBytes(backend(), origin) == 4);
  if (nullptr == tempTensor) {
@@ -160,22 +185,6 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
  if (--TensorUtils::getDescribe(tempTensor)->useCount == 0) {
  forRelease.emplace_back(tempTensor);
  }
-#else
- std::shared_ptr<Tensor> newTensor(new Tensor);
- TensorUtils::copyShape(origin, newTensor.get());
- TensorUtils::getDescribe(newTensor.get())->dimensionFormat = midFormat;
- TensorUtils::getDescribe(newTensor.get())->quantAttr = TensorUtils::getDescribe(origin)->quantAttr;
- newTensor->buffer().type = origin->getType();
- TensorUtils::setLinearLayout(newTensor.get());
- mTempInput.insert(std::make_pair(origin, newTensor.get()));
- auto res = backend()->onAcquireBuffer(newTensor.get(), Backend::DYNAMIC);
- if (!res) {
- return OUT_OF_MEMORY;
- }
- auto tempTensor = newTensor.get();
- backend()->onReleaseBuffer(tempTensor, Backend::DYNAMIC);
- cache->pushCacheTensor(newTensor, origin, midFormat);
-#endif
  mTempInputCopy.emplace_back(std::make_pair(tempTensor, &slice));
  }
  for (auto t : forRelease) {
@@ -185,7 +194,15 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
  backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
  }
  auto threadNumber = static_cast<CPUBackend*>(backend())->threadNumber();
- if (mTempInputCopy.size() == 1 && threadNumber > 1) {
+ mHasReduce = false;
+ ReduceInfo reduceInfo;
+ for (auto& iter : mTempInputCopy) {
+ if (reduceInfo.compute(*iter.second)) {
+ mHasReduce = true;
+ break;
+ }
+ }
+ if (mTempInputCopy.size() == 1 && threadNumber > 1 && (!mHasReduce)) {
  // Split to multi region
  auto region = mTempInputCopy[0].second;
  const int thredHold = 100;//TODO: Find better way to determine it
@@ -396,25 +413,11 @@ static void _zero(const Tensor::InsideDescribe::Region& slice, int bytes, uint8_
  }
 }
 static bool _reduceblit(const Tensor::InsideDescribe::Region& slice, int bytes, const uint8_t* srcPtr, uint8_t* dstPtr) {
- int reduceMask[3] = {0, 0, 0};
- int reduceNum = 0;
- int reduceIndex[3];
- int normalIndex[3];
- int normalNum = 0;
- for (int i=0; i<3; ++i) {
- if (slice.size[i] > 1 && slice.dst.stride[i] == 0) {
- reduceMask[i] = 1;
- reduceIndex[reduceNum] = i;
- reduceNum ++;
- } else {
- normalIndex[normalNum] = i;
- normalNum++;
- }
- }
- if (0 == reduceNum) {
- return false;
- }
- switch (reduceNum) {
+ ReduceInfo reduceInfo;
+ reduceInfo.compute(slice);
+ auto normalIndex = reduceInfo.normalIndex;
+ auto reduceIndex = reduceInfo.reduceIndex;
+ switch (reduceInfo.reduceNum) {
  case 3:
  {
  float summer = 0.0f;
@@ -490,14 +493,13 @@ static bool _reduceblit(const Tensor::InsideDescribe::Region& slice, int bytes,
  return false;
 }
 
-static void _blit(const Tensor::InsideDescribe::Region& slice, int bytes, const uint8_t* srcPtr, uint8_t* dstPtr) {
+static void _blit(const Tensor::InsideDescribe::Region& slice, int bytes, const uint8_t* srcPtr, uint8_t* dstPtr, bool hasReduce) {
  auto proc = _selectUnitProc(bytes, slice.src.stride[2], slice.dst.stride[2]);
-#define MNN_BLIT_SUPPORT_REDUCE
-#ifdef MNN_BLIT_SUPPORT_REDUCE
- if (_reduceblit(slice, bytes, srcPtr, dstPtr)) {
- return;
+ if (hasReduce) {
+ if (_reduceblit(slice, bytes, srcPtr, dstPtr)) {
+  return;
+ }
  }
-#endif
  if (slice.src.stride[1] == slice.size[2] && slice.dst.stride[1] == slice.size[2] && slice.src.stride[2] == 1) {
  for (int z=0; z<slice.size[0]; ++z) {
  auto srcZ = srcPtr + z * slice.src.stride[0] * bytes;
@@ -624,13 +626,17 @@ ErrorCode CPURaster::onExecute(const std::vector<Tensor *> &____inputs, const st
  tensorConvert(iter.first, iter.second, bytes);
  }
  threadNum = ALIMIN(threadNum, (int)mTempInputCopy.size());
+ if (mHasReduce) {
+ // Don't support reduce with multi thread now
+ threadNum = 1;
+ }
  MNN_CONCURRENCY_BEGIN(tId, threadNum) {
  for (int u=tId; u<mTempInputCopy.size(); u+=threadNum) {
  auto& iter = mTempInputCopy[u];
  auto& slice = *(iter.second);
  auto srcPtr = iter.first->host<uint8_t>() + slice.src.offset * bytes;
  auto dstPtr = (uint8_t*)mOutputPtr + slice.dst.offset * bytes;
- _blit(slice, bytes, srcPtr, dstPtr);
+ _blit(slice, bytes, srcPtr, dstPtr, mHasReduce);
  }
  }
  MNN_CONCURRENCY_END();
@@ -807,7 +813,7 @@ class CPULoop : public Execution {
  if (halide_type_float == input->getType().code) {
  bytes = cpubackend->functions()->bytes;
  }
- _blit(reg, bytes, input->host<uint8_t>(), output->host<uint8_t>());
+ _blit(reg, bytes, input->host<uint8_t>(), output->host<uint8_t>(), false);
  }
 
  }
@@ -855,7 +861,7 @@ class CPULoop : public Execution {
  auto dstOffset = dstIter * step0 + dstView->offset();
  if (dstOffset >= 0) {
  if (srcOffset >= 0 && srcOffset < inputSize) {
- _blit(reg, bytes, input->host<uint8_t>() + bytes * srcOffset, output->host<uint8_t>() + bytes * dstOffset);
+ _blit(reg, bytes, input->host<uint8_t>() + bytes * srcOffset, output->host<uint8_t>() + bytes * dstOffset, false);
  } else {
  _zero(reg, bytes, output->host<uint8_t>() + bytes * dstOffset);
  }
@@ -921,7 +927,7 @@ class CPULoop : public Execution {
  auto step0 = cmd->steps()->data()[0];
  auto step1 = cmd->steps()->data()[1];
  auto loopNumber = mLoop->loopNumber();
- _blit(reg, bytes, (const uint8_t*)src, (uint8_t*)dst);
+ _blit(reg, bytes, (const uint8_t*)src, (uint8_t*)dst, false);
  break;
  }
  auto proc = static_cast<CPUBackend*>(backend())->functions()->MNNSelectUnaryFunctionForFloat(op->main_as_UnaryOp()->opType(), static_cast<CPUBackend*>(backend())->precisionMode());

diff --git a/source/backend/cpu/CPURaster.hpp b/source/backend/cpu/CPURaster.hpp
@@ -36,6 +36,7 @@ class CPURaster : public Execution {
  OpCommonUtils::TensorConvertParameter mSingleConvert;
  std::vector<std::shared_ptr<Tensor::InsideDescribe::Region>> mCacheRegions;
  int32_t mZeroPoint = 0;
+ bool mHasReduce = false;
 };
 }
 #endif
diff --git a/source/backend/opencl/core/OpenCLBackend.cpp b/source/backend/opencl/core/OpenCLBackend.cpp
@@ -163,6 +163,9 @@ void CLRuntime::onMaskOpReady(const std::vector<Tensor*>& inputs, const std::vec
  dstInfo->mInfos.emplace_back(std::move(opInfo));
  }
 }
+void CLRuntime::onReset(int numberThread, const BackendConfig* config, bool full) { // Reset hook (declared as an override in OpenCLBackend.hpp): re-applies the GPU mode on the wrapped OpenCL runtime.
+ mOpenCLRuntime->setGpuMode(numberThread); // numberThread is forwarded as the mode value; config and full are not used here.
+}
 
 bool CLRuntime::onSetCache(const void* buffer, size_t size) {
  if (nullptr == buffer) {

diff --git a/source/backend/opencl/core/OpenCLBackend.hpp b/source/backend/opencl/core/OpenCLBackend.hpp
@@ -49,6 +49,7 @@ class CLRuntime : public Runtime {
  virtual ~CLRuntime();
 
  virtual Backend* onCreate(const BackendConfig* config) const override;
+ virtual void onReset(int numberThread, const BackendConfig* config, bool full) override;
  virtual void onGabageCollect(int level) override;
  virtual float onGetMemoryInMB() override;
  virtual std::pair<const void*, size_t> onGetCache() override;

diff --git a/source/backend/opencl/core/runtime/OpenCLRuntime.hpp b/source/backend/opencl/core/runtime/OpenCLRuntime.hpp
@@ -174,11 +174,11 @@ class OpenCLRuntime {
 
  std::pair<const void*, size_t> makeCache(void* tuneInfo);
  bool setCache(std::pair<const void*, size_t> cache);
+ void setGpuMode(const int cl_mode_num);
 private:
  bool loadProgram(const std::string &programName, cl::Program *program);
  bool buildProgram(const std::string &buildOptionsStr, cl::Program *program);
  bool getDeviceSupportsExtension(const cl::Device &device, const char *extensionName);
- void setGpuMode(const int cl_mode_num);
 
 private:
  std::vector<size_t> mMaxImageSize;

diff --git a/source/core/TensorUtils.cpp b/source/core/TensorUtils.cpp
@@ -544,15 +544,30 @@ static bool _ClipDst(int* stride, int srcOffset, int dstOffset, const int* srcSi
  dstMin[i] = ALIMAX(0, -o[i]);
  dstMax[i] = ALIMIN(srcSize[i]-o[i], dstSize[i]);
  }
+ int srcMin = -1;
+ for (int i=0; i<sizeNum; ++i) {
+ if (dstMax[i] < srcSize[i]) {
+ if (srcMin == -1) {
+ srcMin = stride[i];
+ } else {
+ srcMin = ALIMIN(stride[i], srcMin);
+ }
+ }
+ }
+ if (srcMin < 0) {
+ // Src is fully used
+ return true;
+ }
+
  // Check If dstMax is inside src, it means one region can't describe dst - src
  // TODO: Support slice region to support fuse
  for (int i=0; i<sizeNum; ++i) {
  if (dstMax[i] == dstSize[i]) {
  continue;
  }
  int bias = offsetBias + dstMax[i] * stride[i];
- if (bias < srcMax) {
- // for [dstMax, dstSize], exist value match formula
+ if (bias < srcMax && bias >= srcMin) {
+ // for [dstMax, dstSize], may exist value match formula
  return false;
  }
  }
@@ -578,7 +593,6 @@ class TensorUtils::FuseRegionStatus {
 public:
  enum Status {
  FUSE_SRC_COPY,
- FUSE_DST_COPY,
  FUSE_REGION_COMPUTE
  };
  void apply(const Tensor::InsideDescribe::Region& srcReg, Tensor::InsideDescribe::Region& dstReg) {
@@ -587,16 +601,6 @@ class TensorUtils::FuseRegionStatus {
  dstReg.origin = srcReg.origin;
  dstReg.src.offset += srcReg.src.offset - srcReg.dst.offset;
  break;
- case FUSE_DST_COPY:
- dstReg.origin = srcReg.origin;
- dstReg.dst = srcReg.dst;
- dstReg.src = srcReg.src;
- dstReg.src.offset = mSrcOff;
- dstReg.dst.offset = mDstOff;
- dstReg.size[0] = srcReg.size[0];
- dstReg.size[1] = srcReg.size[1];
- dstReg.size[2] = srcReg.size[2];
- break;
  case FUSE_REGION_COMPUTE:
  {
  if (dstSize[0] == 0) {
@@ -612,7 +616,7 @@ class TensorUtils::FuseRegionStatus {
  int valid[3] = {0, 0, 0};
  int offset = 3 - dstNum;
  if (dstNum > sizeNum) {
- for (int i = 2; i >= 0; i--) {
+ for (int i = dstNum - 1; i >= 0; i--) {
  if (i < dstNum) {
  if (dstSize[i] == 1) {
  expandIdx = i;
@@ -691,17 +695,6 @@ class TensorUtils::FuseRegionStatus {
  mStatus = FUSE_SRC_COPY;
  return true;
  }
- // dst copy fuse
- if (isCopyRegion(dstReg) && dstTotalSize == srcTotalSize && copyValid) {
- mSrcOff = dstReg.src.offset - srcReg.dst.offset;
- mDstOff = dstReg.dst.offset;
- mSrcOff = offsetCompute(srcReg, dstReg.src.offset, srcReg.dst.offset, true) + srcReg.src.offset;
- if (!(srcReg.src.stride[2] > 0 && mSrcOff % srcReg.src.stride[2] != 0)) {
- // when transpose + slice, offset is not align can't fuse
- mStatus = FUSE_DST_COPY;
- return true;
- }
- }
  #define MNN_3_INT_INIT(x, y) { x[0] = y; x[1] = y; x[2] = y; }
  MNN_3_INT_INIT(dstStride, -1)
  MNN_3_INT_INIT(srcStride, -1)

diff --git a/test/core/RegionFuse.cpp b/test/core/RegionFuse.cpp
@@ -17,7 +17,7 @@ class RegionFuseTest : public MNNTestCase {
  using Region = Tensor::InsideDescribe::Region;
  virtual ~RegionFuseTest() = default;
  virtual bool run(int precision) {
- constexpr int N = 11;
+ constexpr int N = 12;
  // [src_offset, src_stride_0_1_2, dst_offset, dst_stride_0_1_2, size_0_1_2]
  int data[N*3][11] = {
  // 2D-transpose + 2D-transpose = memcpy: [1, 4, 16] => [1, 16, 4] => [1, 4, 16]
@@ -64,6 +64,10 @@ class RegionFuseTest : public MNNTestCase {
  {0, 12321, 111, 1, 0, 12544, 112, 1, 32, 111, 111},
  {113, 12544, 112, 1, 0, 12321, 111, 1, 32, 111, 111},
  {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+ // concat + stack
+ {0, 32, 1, 1, 32, 64, 1, 1, 20, 32, 1},
+ {0, 0, 1280, 1, 0, 1, 640, 1, 1, 1, 640},
+ {0, 0, 32, 1, 32, 0, 64, 1, 1, 10, 32},
  };
  TensorUtils::FuseWrap fuseUtils;
  for (int i = 0; i < N; i++) {