MNN:Sync: Sync Internal 3.0.0
xiaying committed Nov 18, 2024
1 parent 92d69ae commit 5f43789
Showing 214 changed files with 10,688 additions and 6,061 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -24,6 +24,7 @@ out/
.gradle
.gradle/
build/
buildvisionOs/

# Signing files
.signing/
33 changes: 18 additions & 15 deletions CMakeLists.txt
@@ -73,7 +73,7 @@ option(MNN_SUPPORT_BF16 "Enable MNN's bf16 op" OFF)
option(MNN_LOW_MEMORY "Build MNN support low memory for weight quant model." OFF)
option(MNN_CPU_WEIGHT_DEQUANT_GEMM "Build MNN CPU weight dequant related gemm kernels." OFF)

IF (OHOS)
IF (OHOS AND MNN_INTERNAL)
include($ENV{NODE_PATH}/@ali/tcpkg/tcpkg.cmake)
export_headers(DIR ${CMAKE_SOURCE_DIR}/include/MNN)
IF (MNN_BUILD_OPENCV)
@@ -209,6 +209,7 @@ option(MNN_VULKAN "Enable Vulkan" OFF)
option(MNN_ARM82 "Enable ARMv8.2's FP16 Compute" ON)
option(MNN_KLEIDIAI "Enable KLEIDIAI" OFF)
option(MNN_ONEDNN "Enable oneDNN" OFF)
option(MNN_AVX2 "Open AVX2 Compile for x86 if possible" ON)
option(MNN_AVX512 "Enable AVX512" OFF)
option(MNN_CUDA "Enable CUDA" OFF)
option(MNN_TENSORRT "Enable TensorRT" OFF)
@@ -312,6 +313,9 @@ IF(MNN_DEBUG_MEMORY)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address")
endif()

set(MNN_DEPS "")
set(MNN_EXTRA_DEPENDS "")

IF(CMAKE_BUILD_TYPE MATCHES Debug)
add_definitions(-DMNN_DEBUG -DDEBUG)
if(MSVC)
@@ -337,6 +341,13 @@ else()
endif()
endif()
ENDIF(CMAKE_BUILD_TYPE MATCHES Debug)
if(OHOS)
IF(MNN_USE_LOGCAT)
add_definitions(-DMNN_USE_LOGCAT)
add_definitions(-Wno-format-security)
list(APPEND MNN_EXTRA_DEPENDS libhilog_ndk.z.so)
ENDIF()
endif()
if(CMAKE_SYSTEM_NAME MATCHES "^Android")
IF(MNN_USE_LOGCAT)
add_definitions(-DMNN_USE_LOGCAT)
@@ -456,8 +467,6 @@ IF(MNN_BUILD_LLM)
ENDIF()


set(MNN_DEPS "")
set(MNN_EXTRA_DEPENDS "")

# Add Thread dependency
find_package(Threads)
@@ -505,13 +514,11 @@ if (NOT MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math -fno-rtti -fno-exceptions ")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math")
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /fp:fast")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:precise")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /fp:precise")
endif()

# Metal
set(MNN_DEPS "")
set(MNN_EXTRA_DEPENDS "")
list(APPEND MNN_DEPS MNN)

# Plugin
@@ -531,14 +538,10 @@ endif()
# CoreML
IF(MNN_COREML)
add_definitions(-DMNN_COREML_ENABLED=1)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/source/backend/coreml/)
include(${CMAKE_CURRENT_LIST_DIR}/source/backend/coreml/CMakeLists.txt)

IF(MNN_SEP_BUILD)
list(APPEND MNN_DEPS MNNCoreML)
list(APPEND MNN_EXTRA_DEPENDS MNNCoreML)
ELSE()
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNCoreML>)
ENDIF()
list(APPEND MNN_TARGETS MNNCoreML)
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNCoreML>)

find_library(COREML CoreML)
find_library(FOUNDATION Foundation)
@@ -639,7 +642,7 @@ ELSE()
ENDIF()

# Model Internal. Enable MNN internal features such as model authentication and metrics logging.
if (MNN_INTERNAL)
if (MNN_INTERNAL AND NOT OHOS) # TODO: support OHOS logging
target_compile_options(MNNCore PRIVATE -DMNN_INTERNAL_ENABLED)
target_compile_options(MNN_Express PRIVATE -DMNN_INTERNAL_ENABLED)
include(${CMAKE_CURRENT_LIST_DIR}/source/internal/logging/CMakeLists.txt)
8 changes: 7 additions & 1 deletion README.md
@@ -7,6 +7,10 @@
## Intro
MNN is a highly efficient and lightweight deep learning framework. It supports inference and training of deep learning models and has industry-leading performance for inference and training on-device. At present, MNN has been integrated into more than 30 apps of Alibaba Inc, such as Taobao, Tmall, Youku, DingTalk, Xianyu, etc., covering more than 70 usage scenarios such as live broadcast, short video capture, search recommendation, product searching by image, interactive marketing, equity distribution, security risk control. In addition, MNN is also used on embedded devices, such as IoT.

[MNN-LLM](https://github.com/alibaba/MNN/tree/master/transformers/llm) is a large language model runtime solution developed based on the MNN engine. The mission of this project is to deploy LLM models locally on everyone's platforms (mobile phone / PC / IoT). It supports popular large language models such as Qianwen, Baichuan, Zhipu, LLAMA, and others. [MNN-LLM User guide](https://mnn-docs.readthedocs.io/en/latest/transformers/llm.html)

[MNN-Diffusion](https://github.com/alibaba/MNN/tree/master/transformers/diffusion) is a stable diffusion model runtime solution developed based on the MNN engine. The mission of this project is to deploy stable diffusion models locally on everyone's platforms. [MNN-Diffusion User guide](https://mnn-docs.readthedocs.io/en/latest/transformers/diffusion.html)

![architecture](doc/architecture.png)

Inside Alibaba, [MNN](https://mp.weixin.qq.com/s/5I1ISpx8lQqvCS8tGd6EJw) works as the basic module of the compute container in the [Walle](https://mp.weixin.qq.com/s/qpeCETty0BqqNJV9CMJafA) System, the first end-to-end, general-purpose, and large-scale production system for device-cloud collaborative machine learning, which has been published in the top system conference OSDI’22. The key design principles of MNN and the extensive benchmark testing results (vs. TensorFlow, TensorFlow Lite, PyTorch, PyTorch Mobile, TVM) can be found in the OSDI paper. The scripts and instructions for benchmark testing are put in the path “/benchmark”. If MNN or the design of Walle helps your research or production use, please cite our OSDI paper as follows:
@@ -26,7 +30,9 @@ Inside Alibaba, [MNN](https://mp.weixin.qq.com/s/5I1ISpx8lQqvCS8tGd6EJw) works a


## Documentation and Workbench
MNN's docs are in place in [Yuque docs here](https://www.yuque.com/mnn/en) and [Read the docs](https://mnn-docs.readthedocs.io/en/latest).
MNN's docs are in place in [Read the docs](https://mnn-docs.readthedocs.io/en/latest).

You can also read docs/README to build the docs' HTML.
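
If the docs follow a standard Sphinx layout (which the Read the Docs hosting suggests; docs/README is authoritative), a local HTML build might look roughly like this sketch:

```bash
# Assumed Sphinx workflow -- consult docs/README for the project's actual steps.
pip install sphinx sphinx-rtd-theme
cd docs
make html   # HTML output typically lands in docs/_build/html
```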

MNN Workbench could be downloaded from [MNN's homepage](http://www.mnn.zone), which provides pretrained models, visualized training tools, and one-click deployment of models to devices.

8 changes: 7 additions & 1 deletion README_CN.md
@@ -6,6 +6,10 @@

[MNN](https://github.com/alibaba/MNN) is a lightweight deep neural network engine that supports inference and training of deep learning models and runs on servers, PCs, mobile phones, and embedded devices of all kinds. At present, MNN is used in more than 30 Alibaba apps, such as Mobile Taobao, Mobile Tmall, and Youku, covering scenarios including live broadcast, short video capture, search recommendation, product search by image, interactive marketing, equity distribution, and security risk control.

[MNN-LLM](https://github.com/alibaba/MNN/tree/master/transformers/llm) is a large language model runtime solution built on the MNN engine, aimed at efficient local deployment of LLMs on devices (mobile phone / PC / embedded). It supports popular large language models such as Qianwen, Baichuan, Zhipu, LLAMA, and others. Tutorial: [MNN-LLM User guide](https://mnn-docs.readthedocs.io/en/latest/transformers/llm.html)

[MNN-Diffusion](https://github.com/alibaba/MNN/tree/master/transformers/diffusion) is a Stable Diffusion text-to-image runtime solution built on the MNN engine, aimed at efficient local deployment of Stable Diffusion models. Tutorial: [MNN-Diffusion User guide](https://mnn-docs.readthedocs.io/en/latest/transformers/diffusion.html)

![architecture](doc/architecture.png)

Inside Alibaba, [MNN](https://mp.weixin.qq.com/s/5I1ISpx8lQqvCS8tGd6EJw) serves as the basic module of the compute container in the [Walle](https://mp.weixin.qq.com/s/qpeCETty0BqqNJV9CMJafA) system, the first end-to-end, general-purpose, and large-scale production system for device-cloud collaborative machine learning, published at the top system conference OSDI 2022. The Walle paper explains MNN's key design principles and provides benchmark results against other deep learning frameworks (TensorFlow, TensorFlow Lite, PyTorch, PyTorch Mobile, TVM). The benchmark scripts and instructions are under the "/benchmark" directory. If MNN or the design of Walle helps your research or production, please cite our OSDI paper:
@@ -26,7 +30,9 @@
## Documentation and Workbench
MNN documentation:
- [Latest docs (readthedocs)](https://mnn-docs.readthedocs.io/en/latest/index.html)
- [Yuque docs](https://www.yuque.com/mnn/cn)

- You can also read docs/README to build the documentation locally


[MNN's homepage](http://www.mnn.zone) also offers MNN Workbench, a new tool from the MNN team, providing ready-to-use models, visualized training tools, and one-click deployment of models to devices.

4 changes: 3 additions & 1 deletion docs/compile/cmake.md
@@ -40,7 +40,8 @@ MNN uses CMake to build the project; the available CMake macros are listed below:
| MNN_VULKAN | Whether to build the `Vulkan` backend; `OFF` by default |
| MNN_ARM82 | When building for ARM, whether to build the `Armv8.2` backend to support FP16 computation; `ON` by default |
| MNN_ONEDNN | Whether to use `oneDNN`; `OFF` by default |
| MNN_AVX512 | Whether to build the `avx512` backend; `OFF` by default |
| MNN_AVX2 | With `MNN_USE_SSE` enabled, whether to add support for AVX2 instructions; `ON` by default |
| MNN_AVX512 | With `MNN_USE_SSE` and `MNN_AVX2` enabled, whether to add support for the `avx512` instruction set; `OFF` by default |
| MNN_CUDA | Whether to build the `Cuda` backend; `OFF` by default |
| MNN_CUDA_PROFILE | Whether to enable the CUDA profiling tool; `OFF` by default |
| MNN_CUDA_QUANT | Whether to enable building the CUDA quantization files; `OFF` by default |
@@ -85,3 +86,4 @@ MNN uses CMake to build the project; the available CMake macros are listed below:
| MNN_SUPPORT_TRANSFORMER_FUSE | Whether to support fused Transformer-related op implementations; `OFF` by default |
| MNN_BUILD_LLM | Whether to build the MNN-based llm library and demo; `OFF` by default |
| MNN_BUILD_DIFFUSION | Whether to build the MNN-based diffusion demo; requires MNN_BUILD_OPENCV and MNN_IMGCODECS to be enabled; `OFF` by default |
| MNN_KLEIDIAI | Whether to integrate ARM's KleidiAI acceleration library (currently experimental; only symmetric-quantized LLM models are supported); `OFF` by default |
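
For illustration, a configure step that turns on a few of the options above might look like the following sketch (option names are taken from the table; the exact combination depends on your target):

```bash
# Sketch only: enable the AVX-512 kernels, the LLM library/demo, and the
# diffusion demo (which also needs MNN_BUILD_OPENCV and MNN_IMGCODECS).
mkdir build && cd build
cmake .. -DMNN_AVX512=ON \
         -DMNN_BUILD_LLM=ON \
         -DMNN_BUILD_DIFFUSION=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON
make -j8
```
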
74 changes: 69 additions & 5 deletions docs/compile/engine.md
@@ -1,17 +1,17 @@
# Building the Main Library
The default build artifacts are `libMNN.so` and `express/libMNN_Express.so`.
## Linux/MacOS
- Requirements
### Requirements
  - cmake >= 3.10
  - gcc >= 4.9, or clang
- Relevant build options
### Relevant build options
  - `MNN_AVX512` Whether to use AVX-512 instructions; building requires gcc 9 or later
  - `MNN_OPENCL` Whether to use the OpenCL backend, targeting GPU devices
  - `MNN_METAL` Whether to use the Metal backend, targeting macOS/iOS GPU devices
  - `MNN_VULKAN` Whether to use the Vulkan backend, targeting GPU devices
  - `MNN_CUDA` Whether to use the CUDA backend, targeting NVIDIA GPU devices
  - Other build options can be found in CMakeLists.txt
- Steps
### Steps
  1. Preparation (optional; needed after modifying the MNN schema)
```bash
cd /path/to/MNN
@@ -22,6 +22,15 @@
```bash
mkdir build && cd build && cmake .. && make -j8
```
### Building on Mac M1
- The Mac M1 is somewhat special in that, as a transitional chip, it supports both the Arm and x64 architectures, so you generally need to specify the architecture explicitly to get the one you want
- Add `-DCMAKE_OSX_ARCHITECTURES=arm64` at the cmake step to build Arm libraries; correspondingly, add `-DCMAKE_OSX_ARCHITECTURES=x86_64` when building for x64:

```
cd /path/to/MNN
mkdir build && cd build && cmake .. -DCMAKE_OSX_ARCHITECTURES=arm64 && make -j8
```

## Windows (non-ARM)
- Requirements
- Microsoft Visual Studio >= 2017
@@ -87,14 +96,23 @@
mkdir build_64 && cd build_64 && ../build_64.sh
```
## iOS
You can build either with the build script or with the Xcode project.

- Requirements
  - xcode
  - cmake
- Relevant build options
  - `MNN_METAL` Whether to use the Metal backend; the Metal backend can use the GPU for acceleration
  - `MNN_COREML` Whether to use the CoreML backend; the CoreML backend can use the ANE hardware for acceleration
  - `MNN_ARM82` Whether to support fp16 inference. With this option enabled, when precision is set to Precision_Low, low-precision (fp16) inference is used on supported devices (ARMv8.2 and later architectures), reducing memory usage and improving performance
- Steps
  - On macOS, open project/ios/MNN.xcodeproj with Xcode and click Build

- Building with Xcode: open project/ios/MNN.xcodeproj with Xcode and click Build; all of the build options above are enabled by default in the project

- Building with the script: run the script with the `MNN_ARM82` option enabled
```
sh package_scripts/ios/buildiOS.sh "-DMNN_ARM82=true"
```
## Cross-compiling for other platforms
Because the target devices and the vendor-provided toolchains vary widely, this document cannot give step-by-step instructions for every case. The following is the general flow; adapt it to your specific scenario.
Cross-compilation roughly consists of two steps: obtaining a cross-compiler and configuring CMake for cross-compilation.
@@ -137,3 +155,49 @@
-DCMAKE_CXX_COMPILER=$cross_compile_toolchain/bin/aarch64-linux-gnu-g++
make -j4
```

## Web

- The MNN source code can be compiled to WebAssembly for use in the browser

### Installing emcc
See https://emscripten.org/docs/getting_started/downloads.html ; after installation and activation, emcmake is available

### Building (general)
- Use emcmake cmake instead of cmake, then make:
```
mkdir build
cd build
emcmake cmake .. -DCMAKE_BUILD_TYPE=Release -DMNN_FORBID_MULTI_THREAD=ON -DMNN_USE_THREAD_POOL=OFF -DMNN_USE_SSE=OFF
emmake make MNN -j16
```
After the build finishes, libMNN.a is produced; it can be linked into subsequent WebAssembly programs. When linking, generally add -s ALLOW_MEMORY_GROWTH=1 to avoid crashes when memory runs out
### SIMD support
- If the target device is confirmed to support Web SIMD, add -msimd128 -msse4.1 at the cmake step for a significant performance gain, e.g.:
```
mkdir build
cd build
emcmake cmake .. -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_TEST=true -DCMAKE_CXX_FLAGS="-msimd128 -msse4.1" -DMNN_FORBID_MULTI_THREAD=ON -DMNN_USE_THREAD_POOL=OFF -DMNN_USE_SSE=ON
emmake make MNN -j16
```
### Testing
Because the file system behaves differently on the Web, it is recommended to build and run only run_test.out; the other test tools need --preload-file {dir}
- Build example
```
mkdir build
cd build
emcmake cmake .. -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_TEST=true -DCMAKE_CXX_FLAGS="-msimd128 -msse4.1 -s ALLOW_MEMORY_GROWTH=1" -DMNN_FORBID_MULTI_THREAD=ON -DMNN_USE_THREAD_POOL=OFF -DMNN_USE_SSE=ON
emmake make -j16
```
- Run
```
node run_test.out.js speed/MatMulBConst //test performance
node run_test.out.js //test correctness
```
52 changes: 25 additions & 27 deletions docs/contribute/op.md
@@ -335,33 +335,22 @@ REGISTER_METAL_OP_CREATOR(MetalMyCustomOpCreator, OpType_MyCustomOp);
Re-run CMake, or manually add the new files to the Xcode project

### Adding a Vulkan implementation
1. Add the shader
Add the shader (*.comp) under the `source/backend/vulkan/execution/glsl` directory. If the input memory layout is `NC4HW4`, implement it with `image`; otherwise use a buffer implementation. See the existing implementations in that directory for reference. Then run the `makeshader.py` script to compile the shaders.

2. Declare the implementation class
Add `VulkanMyCustomOp.hpp` and `VulkanMyCustomOp.cpp` under the directory `source/backend/vulkan/execution/`:
```cpp
class VulkanMyCustomOp : public VulkanBasicExecution {
public:
VulkanMyCustomOp(const Op* op, Backend* bn);
virtual ~VulkanMyCustomOp();
ErrorCode onEncode(const std::vector<Tensor*>& inputs,
const std::vector<Tensor*>& outputs,
const VulkanCommandPool::Buffer* cmdBuffer) override;
private:
// Parameters needed by the GPU shader
std::shared_ptr<VulkanBuffer> mConstBuffer;
// Pipeline
const VulkanPipeline* mPipeline;
// Layout Descriptor Set
std::shared_ptr<VulkanPipeline::DescriptorSet> mDescriptorSet;
};
```
3. Implement
Implement the `onEncode` function. First check the memory layout: if it is `NC4HW4`, implement the shader with image, otherwise with buffer. Return NO_ERROR when finished.
4. Register the implementation class
The Vulkan backend currently supports two tensor storage types: buffer and image. Developers can choose the storage type at build time via the `MNN_VULKAN_IMAGE` macro. When adding an op to the Vulkan backend, you likewise need to decide which storage type to target and develop under the corresponding directory. The following takes the image type as an example to describe the main steps for adding an op to the Vulkan backend.

1. Implement the Execution
    - Run the script `source/backend/vulkan/image/compiler/VulkanCodeGen.py`, which adds template code `VulkanMyOp.hpp` and `VulkanMyOp.cpp` to `source/backend/vulkan/image/execution`
    - Implement the constructor
        - Read the constant parameters from the CPU and write them to the GPU
        - Create the pipeline needed by the op
            - Determine the shader and the macros to use
            - Set descriptorTypes, i.e. the types of the GPU memory objects used in the shader
            - Call the getPipeline interface
    - Implement onEncode
        - Allocate GPU memory and update the descriptorSet, writing the GPU memory objects the shader reads and writes into the descriptorSet
        - Add a memoryBarrier
        - Bind the pipeline and the descriptorSet to the cmdBuffer
        - Dispatch the command
    - Register the op and add the creator class (a skeleton of such an execution class is sketched after the registration snippet below)
```cpp
class VulkanMyCustomOpCreator : public VulkanBackend::Creator {
public:
@@ -377,6 +366,15 @@ static bool gResistor = []() {
}();
```
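
For orientation, the execution class produced by `VulkanCodeGen.py` ends up looking roughly like the skeleton below — a sketch only, mirroring the `VulkanMyCustomOp` declaration shown earlier; the actual members and parameters depend on the op:

```cpp
// Illustrative skeleton of a Vulkan image-type execution; the names follow the
// VulkanMyCustomOp example above and are not the generated code verbatim.
class VulkanMyOp : public VulkanBasicExecution {
public:
    VulkanMyOp(const Op* op, Backend* backend) : VulkanBasicExecution(backend) {
        // Constructor: read constant parameters from the CPU-side op and upload
        // them to mConstBuffer; pick the shader and macros, set descriptorTypes,
        // then call getPipeline to create the pipeline.
    }
    virtual ~VulkanMyOp() = default;

    ErrorCode onEncode(const std::vector<Tensor*>& inputs,
                       const std::vector<Tensor*>& outputs,
                       const VulkanCommandPool::Buffer* cmdBuffer) override {
        // onEncode: allocate GPU resources, write the buffers/images the shader
        // reads and writes into mDescriptorSet, add a memory barrier, bind the
        // pipeline and descriptor set to cmdBuffer, then dispatch.
        return NO_ERROR;
    }

private:
    std::shared_ptr<VulkanBuffer> mConstBuffer;                     // shader parameters
    const VulkanPipeline* mPipeline = nullptr;                      // compute pipeline
    std::shared_ptr<VulkanPipeline::DescriptorSet> mDescriptorSet;  // descriptor set
};
```
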
2. Implement and compile the shader
    - Write the compute shader file `myOp.comp` and add it to the directory `source/backend/vulkan/image/execution/glsl`
    - Add the macros used by the op to `source/backend/vulkan/image/execution/glsl/macro.json`
    - Run the script `source/backend/vulkan/image/compiler/makeshader.py`, which compiles `myOp.comp` and updates `source/backend/vulkan/image/compiler/AllShader.cpp`, `source/backend/vulkan/image/shaders/AllShader.h`, and `source/backend/vulkan/image/compiler/VulkanShaderMap.cpp`
> MNN Vulkan currently compiles all compute shaders with glslangValidator (glslang repository: <https://github.com/KhronosGroup/glslang>, version 12.2.0, commit id: d1517d64cfca91f573af1bf7341dc3a5113349c0). If you need your locally compiled binaries to match the compiled results already in the MNN repository, make sure the glslang version in your environment matches the one MNN uses.
### Adding an OpenCL implementation
1. Add the kernel
Add the kernel (*.cl) under the `source/backend/opencl/execution/cl` directory. Feature maps are currently all implemented with `image2d`. See the existing implementations in that directory for reference. Then run `opencl_codegen.py` to generate the kernel map.