Merge pull request #2774 from wangzhaode/feature/sync_2.8.2
[MNN:Sync] Sync Internal 2.8.2
wangzhaode committed Feb 29, 2024
2 parents 5607201 + 970b63f commit 7eddd45
Showing 295 changed files with 1,013,758 additions and 3,667 deletions.
5 changes: 4 additions & 1 deletion CMakeLists.txt
@@ -44,6 +44,7 @@ option(MNN_DEBUG_MEMORY "MNN Debug Memory Access" OFF)
option(MNN_DEBUG_TENSOR_SIZE "Enable Tensor Size" OFF)
option(MNN_GPU_TRACE "Enable MNN Gpu Debug" OFF)
option(MNN_SUPPORT_RENDER "Enable MNN Render Ops" OFF)
option(MNN_SUPPORT_TRANSFORMER_FUSE "Enable MNN transformer Fuse Ops" OFF)
option(MNN_PORTABLE_BUILD "Link the static version of third party libraries where possible to improve the portability of built executables" OFF)
option(MNN_SEP_BUILD "Build MNN Backends and expression separately. Only works with MNN_BUILD_SHARED_LIBS=ON" ON)
option(NATIVE_LIBRARY_OUTPUT "Native Library Path" OFF)
@@ -166,7 +167,9 @@ endif()
if(MNN_SUPPORT_RENDER)
add_definitions(-DMNN_SUPPORT_RENDER)
endif()

if(MNN_SUPPORT_TRANSFORMER_FUSE)
add_definitions(-DMNN_SUPPORT_TRANSFORMER_FUSE)
endif()
# debug options
if(MNN_DEBUG_MEMORY)
add_definitions(-DMNN_DEBUG_MEMORY)
6 changes: 6 additions & 0 deletions docker_release.sh
@@ -0,0 +1,6 @@
# Run the release inside the existing mnn_release docker container
docker start mnn_release
docker exec -i -e TEST_ID=$(pwd | awk -F "/" '{print $(NF-1)}') mnn_release bash <<'EOF'
cd ~/yanxing_zhaode/cise/space/$TEST_ID/source && ./release.sh pymnn
exit
EOF
1 change: 1 addition & 0 deletions docs/compile/cmake.md
@@ -81,4 +81,5 @@ MNN is built with CMake; the list of CMake macro options is as follows:
| MNN_VULKAN_IMAGE | Use the Image memory mode when building MNN's Vulkan backend, to support FP16 and GPU acceleration on some mobile devices; defaults to `ON` |
| MNN_LOW_MEMORY | Whether to support low-memory mode; when enabled, weight-quantized models run with `low_memory` set will dequantize at compute time; defaults to `OFF` |
| MNN_SUPPORT_RENDER | Whether to support render-related operator implementations; defaults to `OFF` |
| MNN_SUPPORT_TRANSFORMER_FUSE | Whether to support fused Transformer operator implementations; defaults to `OFF` |
| MNN_BUILD_LLM | Whether to build the MNN-based llm library and demo; defaults to `OFF` |
4 changes: 2 additions & 2 deletions docs/faq.md
@@ -246,7 +246,7 @@ The time of a GPU-backend copy call consists of two parts
- On x86 / x64 without the VNNI instructions, quantized computation must first widen int8 to int16 and multiply-accumulate into int32, which is inherently slower than floating point multiplying and accumulating directly into fp32.
- On x64 with VNNI, quantized computation has dot-product instructions and is clearly faster than FP32; MNN must be compiled with MNN_AVX512 enabled to use them, typically about 30% faster than AVX512 floating point.
- On ARMv7a / ARMv8, quantized computation multiply-accumulates int8 into int16 and then pairwise-adds into int32, which is slightly faster than floating point (typically about a 30% gain).
- On ARMv8.2+, quantized computation has the sdot instruction, but FP32 issue width has also doubled compared with earlier architectures; compile MNN with MNN_ARM82 to enable sdot and quantized computation becomes faster, otherwise FP32 is faster; in the ideal case quantization is more than twice as fast as FP32 and about 20% faster than FP16.
- The ARMv8.2 architecture has the sdot instruction, but FP32 issue width has also doubled compared with earlier architectures, and it adds FP16 vector instructions that are twice as fast as FP32; MNN checks the device architecture to enable sdot / smmla, and in the ideal case quantized computation is more than twice as fast as FP32 and about 20% faster than FP16.
## Other questions
### How to encrypt an MNN model
@@ -256,4 +256,4 @@ The time of a GPU-backend copy call consists of two parts
2. Run `schema/generate.sh` to regenerate the `flatbuffers` headers;
3. Rebuild the `MNN` library and all tools such as `Convert`;
4. Re-convert the model with the new tools;
5. Deploy on device with the new model and the new `MNN` library;
5. Deploy on device with the new model and the new `MNN` library;
7 changes: 5 additions & 2 deletions docs/inference/module.md
@@ -45,9 +45,10 @@ rtmgr->setCache(".cachefile");
// Load the model file and create a new Module
const std::string model_file = "/tmp/mymodule.mnn"; // model file with path
// Input names. May be empty; when empty, MNN searches the model's inputs automatically, but with multiple inputs the order is not guaranteed and must be checked via the getInfo interface
// Input names: with multiple inputs, list them in order; the order must match the input array later passed to onForward
const std::vector<std::string> input_names{"input_1", "input_2", "input_3"};
// Output names. May be empty; when empty, MNN searches the model's outputs automatically, but with multiple outputs the order is not guaranteed and must be checked via the getInfo interface
// Output names: with multiple outputs, list them in order; this order determines the order of onForward's output array
const std::vector<std::string> output_names{"output_1"};
Module::Config mdconfig; // default module config
@@ -56,6 +57,8 @@ std::unique_ptr<Module> module; // module
module.reset(Module::load(input_names, output_names, model_filename.c_str(), rtMgr, &mdconfig));
```

The input and output names may be empty; in that case MNN looks up the inputs/outputs in the model and fills them in, but with multiple inputs/outputs the order is not guaranteed and must be checked via the getInfo interface.
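
A minimal sketch of how this ordering contract plays out (the shapes and variable names are illustrative assumptions, not taken from the model above):

```cpp
using namespace MNN::Express;
// Build one VARP per entry of input_names, in the same order.
VARP in1 = _Input({1, 3, 224, 224}, NCHW); // "input_1" (assumed shape)
VARP in2 = _Input({1, 16}, NCHW);          // "input_2" (assumed shape)
VARP in3 = _Input({1, 4}, NCHW);           // "input_3" (assumed shape)
// ... fill each input with real data, e.g. via writeMap<float>() ...
std::vector<VARP> inputs{in1, in2, in3};   // same order as input_names
auto outputs = module->onForward(inputs);  // outputs follow the order of output_names
```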

### Module::Config
A `Module::Config` can be passed when creating a `Module`; its structure is as follows:

56 changes: 56 additions & 0 deletions docs/pymnn/expr.md
@@ -190,6 +190,62 @@ array([0., 1., 2., 3.], dtype=float32)
```python
>>> expr.set_global_executor_config(2, 2, 1)
```

---
### `sync()`
Synchronize the MNN VARP; after this call, the VARP's computation is guaranteed to be finished.

Returns: `None`

Return type: `None`

Example:

```python
>>> mnn_var = expr.placeholder([2,2])
>>> mnn_var.sync()
```

---
### `set_device_ptr(device_ptr, memory_type)`
Set the GPU memory address of an MNN VARP and specify the memory type (CUDA/OpenCL/OpenGL, etc.) of the given address; only usable when the MNN VARP holds GPU memory:

Parameters:
- `device_ptr:uint64_t` integer pointer address of the memory
- `memory_type:int` e.g. 2 -> CUDA, 3 -> OpenCL, etc.; see MNNForwardType in include/MNN/MNNForwardType.h

Returns: `None`

Return type: `None`

Example:

```python
>>> torch_tensor = torch.empty([1, 1000], dtype=torch.float16).cuda()
>>> mnn_var = expr.placeholder([2,2])
>>> mnn_var.set_device_ptr(torch_tensor.data_ptr(), 2)
```

---
### `copy_to_device_ptr(device_ptr, memory_type)`
Copy the MNN VARP's GPU memory to the given memory address and specify the memory type (CUDA/OpenCL/OpenGL, etc.) of that address:

Parameters:
- `device_ptr:uint64_t` integer pointer address of the memory
- `memory_type:int` e.g. 2 -> CUDA, 3 -> OpenCL, etc.; see MNNForwardType in include/MNN/MNNForwardType.h

Returns: `None`

Return type: `None`

Example:

```python
>>> torch_tensor = torch.empty([1, 1000], dtype=torch.float16).cuda()
>>> mnn_var = expr.placeholder([2,2])
>>> mnn_var.copy_to_device_ptr(torch_tensor.data_ptr(), 2)
```

---
### `sign(x)`
Returns the sign of the input: 1 for positive numbers, -1 for negative numbers
9 changes: 8 additions & 1 deletion express/Executor.cpp
@@ -58,6 +58,10 @@ void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig&
mAttr->firstType = std::make_pair(type, numberThread);
info.mode = Backend::Info::DIRECT;
info.numThread = numberThread;
if (MNN_FORWARD_METAL == type) {
// Turn off Metal's deferred encoder
info.numThread |= MNN_GPU_RECORD_OP;
}
info.user = (BackendConfig*)&config;
std::shared_ptr<Runtime> bn(creator->onCreate(info));
mRuntimes[mAttr->firstType] = bn;
@@ -257,6 +261,9 @@ void Executor::RuntimeManager::setHint(Interpreter::HintMode mode, int value) {
case Interpreter::STRICT_CHECK_MODEL:
mInside->checkNetBuffer = value > 0;
break;
case Interpreter::MEM_ALLOCATOR_TYPE:
mInside->modes.memoryAllocatorType = value;
break;
default:
break;
}
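
A hedged sketch of how this new hint might be set from user code through a RuntimeManager (the hint value and the ScheduleConfig fields are illustrative assumptions):

```cpp
#include <MNN/Interpreter.hpp>
#include <MNN/expr/Executor.hpp>

MNN::ScheduleConfig sconfig;
sconfig.type = MNN_FORWARD_CPU;
std::shared_ptr<MNN::Express::Executor::RuntimeManager> rtmgr(
    MNN::Express::Executor::RuntimeManager::createRuntimeManager(sconfig));
// Route the memory-allocator choice down to the runtime; 0 is an assumed default value.
rtmgr->setHint(MNN::Interpreter::MEM_ALLOCATOR_TYPE, 0);
```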
@@ -538,7 +545,7 @@ void Executor::_makeCache(const std::vector<EXPRP>& expr, bool forceCPU) {
quant->scale = TensorUtils::getDescribe(srcTensor)->quantAttr.get()->scale;
quant->zero = TensorUtils::getDescribe(srcTensor)->quantAttr.get()->zero;
}

TensorUtils::getDescribe(tensor.get())->index = (int)scheduleInfo.allTensors.size();
scheduleInfo.allTensors.emplace_back(tensor);
}
42 changes: 42 additions & 0 deletions express/Expr.cpp
@@ -545,6 +545,48 @@ void Variable::setName(const std::string& name) {
mFrom->setName(name);
}
}

bool Variable::setDevicePtr(const void* devicePtr, int memoryType) {
if (nullptr != mFrom->get()) {
MNN_ERROR("Can't setDevicePtr to no-input op\n");
return false;
}
informDirty();
MNN_ASSERT(TensorUtils::getDescribe(mFrom->inside()->mOutputTensors[0])->quantAttr == nullptr || TensorUtils::getDescribe(mFrom->inside()->mOutputTensors[0])->type == DataType_DT_FLOAT);
mFrom->mInside->mContentDirty = false;
// Clear host address, Don't malloc hostPtr afterwards
Utils::releaseMemoryForHostTensor(mFrom->inside()->mOutputTensors[0]);
return mFrom->inside()->mOutputTensors[0]->setDevicePtr(devicePtr, memoryType);
}

bool Variable::copyToDevicePtr(void* devicePtr, int memoryType) {
if (nullptr != mFrom->get()) {
MNN_ERROR("Can't copyToDevicePtr to no-input op\n");
return false;
}

auto inside = mFrom->inside();
auto originTensor = inside->mOutputTensors[mFromIndex];

auto bn = TensorUtils::getDescribe(originTensor)->getBackend();
if(bn == nullptr) {
MNN_ERROR("Error: Varp copyToDevicePtr can't find backend\n");
return false;
}
if (bn->type() != memoryType) {
MNN_ERROR("Error: VARP backend type ( %d ), is not same as assigned memory type ( %d )\n", bn->type(), memoryType);
return false;
}

MNN::Tensor tempTensor(originTensor->dimensions(), originTensor->getDimensionType());
tempTensor.buffer().device = (uint64_t)devicePtr;

TensorUtils::getDescribe(originTensor)->getBackend()->onCopyBuffer(originTensor, &tempTensor);
// Sync the result
tempTensor.wait(Tensor::MAP_TENSOR_READ, true);
return true;
}

const std::string& Variable::name() const {
return mFrom->outputName(mFromIndex);
}
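
To make the intent of the two new entry points concrete, a hedged usage sketch from the expression API side (`module`, `inputPtr` and `outputPtr` are assumed to exist; the memory type must match the backend the VARP actually lives on, as the check above enforces):

```cpp
using namespace MNN::Express;
// inputPtr / outputPtr are assumed, externally allocated CUDA device buffers.
VARP input = _Input({1, 3, 224, 224}, NCHW);
// Hand an external device buffer to the input VARP instead of host memory.
input->setDevicePtr(inputPtr, MNN_FORWARD_CUDA);
VARP output = module->onForward({input})[0];
// Copy the result into another device buffer without a round trip through the host.
output->copyToDevicePtr(outputPtr, MNN_FORWARD_CUDA);
```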
5 changes: 1 addition & 4 deletions express/Utils.cpp
@@ -116,10 +116,7 @@ bool Utils::allocMemoryForHostTensor(Tensor* dest) {
if (TensorUtils::getDescribe(dest)->memoryType != Tensor::InsideDescribe::MEMORY_HOST) {
return false;
}
auto size = dest->size();
if (0 >= size) {
return false;
}
auto size = dest->usize();
dest->buffer().host = (uint8_t*)MNNMemoryAllocAlign(size, MNN_MEMORY_ALIGN_DEFAULT);
return dest->buffer().host != nullptr;
}
5 changes: 4 additions & 1 deletion express/module/PipelineModule.cpp
@@ -335,7 +335,7 @@ static std::vector<int> _collectNeededOps(const MNN::Net* net, const std::set<in
// 0: use, 1: no use
std::vector<int> opMask(net->oplists()->size());
::memset(opMask.data(), 0, opMask.size() * sizeof(int));

// Set Initial Status
for (auto v : outputIndexes) {
tensorMask[v] = 1;
@@ -638,6 +638,9 @@ Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::
modRuntime.rt.first.begin()->second->setExternalFile(rtMgr->getInside()->mExternalFile);
modRuntime.rt.second->setExternalFile(rtMgr->getInside()->mExternalFile);
}
// set allocator type
modRuntime.rt.first.begin()->second->setAllocatorType(rtMgr->getInside()->modes.memoryAllocatorType);
modRuntime.rt.second->setAllocatorType(rtMgr->getInside()->modes.memoryAllocatorType);
}
auto& rt = modRuntime.rt;
auto firstRt = rt.first[modRuntime.compute.type];
20 changes: 10 additions & 10 deletions include/MNN/ImageProcess.hpp
@@ -44,6 +44,7 @@ enum Wrap { CLAMP_TO_EDGE = 0, ZERO = 1, REPEAT = 2 };
* 2: Sample line and do format convert
* 3: Turn RGBA to float tensor, and do sub and normalize
*/

class MNN_PUBLIC ImageProcess {
public:
struct Inside;
@@ -62,7 +63,6 @@ class MNN_PUBLIC ImageProcess {
/** edge wrapper */
Wrap wrap = CLAMP_TO_EDGE;
};

public:
/**
* @brief create image process with given config for given tensor.
@@ -86,10 +86,10 @@ class MNN_PUBLIC ImageProcess {
static ImageProcess* create(const ImageFormat sourceFormat = RGBA, const ImageFormat destFormat = RGBA,
const float* means = nullptr, const int meanCount = 0, const float* normals = nullptr,
const int normalCount = 0, const Tensor* dstTensor = nullptr);

~ImageProcess();
static void destroy(ImageProcess* imageProcess);

/**
* @brief get affine transform matrix.
* @return affine transform matrix.
@@ -98,7 +98,7 @@ class MNN_PUBLIC ImageProcess {
return mTransform;
}
void setMatrix(const Matrix& matrix);

/**
* @brief convert source data to given tensor.
* @param source source data.
@@ -109,7 +109,7 @@ class MNN_PUBLIC ImageProcess {
* @return result code.
*/
ErrorCode convert(const uint8_t* source, int iw, int ih, int stride, Tensor* dest);

/**
* @brief convert source data to given tensor.
* @param source source data.
@@ -126,7 +126,7 @@ class MNN_PUBLIC ImageProcess {
*/
ErrorCode convert(const uint8_t* source, int iw, int ih, int stride, void* dest, int ow, int oh, int outputBpp = 0,
int outputStride = 0, halide_type_t type = halide_type_of<float>());

/**
* @brief create tensor with given data.
* @param w image width.
@@ -140,7 +140,7 @@ class MNN_PUBLIC ImageProcess {
return createImageTensor(halide_type_of<T>(), w, h, bpp, p);
}
static Tensor* createImageTensor(halide_type_t type, int w, int h, int bpp, void* p = nullptr);

/**
* @brief set padding value when wrap=ZERO.
* @param value padding value.
@@ -149,14 +149,14 @@ class MNN_PUBLIC ImageProcess {
void setPadding(uint8_t value) {
mPaddingValue = value;
}

/**
* @brief set to draw mode.
* @param void
* @return void.
*/
void setDraw();

/**
* @brief draw color to regions of img.
* @param img the image to draw.
@@ -179,4 +179,4 @@ class MNN_PUBLIC ImageProcess {
} // namespace CV
} // namespace MNN

#endif /* ImageProcess_hpp */
#endif /* MNN_ImageProcess_hpp */
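
As a reminder of how this header is used, a brief hedged sketch of the pretreatment flow the class comment describes (buffer names, sizes and the mean/normal values are illustrative assumptions):

```cpp
#include <MNN/ImageProcess.hpp>
#include <cstring>
#include <memory>

MNN::CV::ImageProcess::Config config;
config.sourceFormat = MNN::CV::RGBA;
config.destFormat   = MNN::CV::BGR;
const float mean[3]   = {103.94f, 116.78f, 123.68f}; // assumed values
const float normal[3] = {0.017f, 0.017f, 0.017f};    // assumed values
::memcpy(config.mean, mean, sizeof(mean));
::memcpy(config.normal, normal, sizeof(normal));
std::unique_ptr<MNN::CV::ImageProcess> process(MNN::CV::ImageProcess::create(config));
// rgbaData, width, height and inputTensor are assumed to exist; stride 0 means tightly packed rows.
process->convert(rgbaData, width, height, 0, inputTensor);
```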
2 changes: 1 addition & 1 deletion include/MNN/MNNDefine.h
@@ -69,6 +69,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
#define STR(x) STR_IMP(x)
#define MNN_VERSION_MAJOR 2
#define MNN_VERSION_MINOR 8
#define MNN_VERSION_PATCH 1
#define MNN_VERSION_PATCH 2
#define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
#endif /* MNNDefine_h */
4 changes: 4 additions & 0 deletions include/MNN/Tensor.hpp
@@ -304,6 +304,10 @@ class MNN_PUBLIC Tensor {
* @param finish wait for command flush or finish
*/
int wait(MapType mtype, bool finish);
/**
* @brief set GPU tensor device ptr, and inform memory type
*/
bool setDevicePtr(const void* devicePtr, int memoryType);
private:
halide_buffer_t mBuffer;
struct InsideDescribe* mDescribe;
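
A short hedged sketch of the new Tensor-level call (the device buffer is assumed to be an existing GPU allocation; valid memory-type values come from MNNForwardType.h, e.g. 2 for CUDA):

```cpp
// deviceBuffer is an assumed, externally allocated CUDA pointer.
MNN::Tensor* t = MNN::Tensor::createDevice<float>({1, 1000});
t->setDevicePtr(deviceBuffer, MNN_FORWARD_CUDA);
MNN::Tensor::destroy(t);
```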
5 changes: 4 additions & 1 deletion include/MNN/expr/Expr.hpp
@@ -108,11 +108,14 @@ class MNN_PUBLIC Variable {
Dimensionformat order = NHWC;
INTS dim;
halide_type_t type;
int size;
size_t size;
void syncSize();
};
const std::string& name() const;
void setName(const std::string& name);
bool setDevicePtr(const void* devicePtr, int memoryType);
bool copyToDevicePtr(void* devicePtr, int memoryType);

std::pair<EXPRP, int> expr() const {
return std::make_pair(mFrom, mFromIndex);
}
2 changes: 1 addition & 1 deletion project/android/gradle/wrapper/gradle-wrapper.properties
@@ -3,4 +3,4 @@ distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-4.6-all.zip
distributionUrl=http\://mtl-gradle-mirror.oss-cn-hangzhou.aliyuncs.com/gradle-4.6-all.zip