From 65ec0ea4062cdb99ec68ebab2abf6747fd7b103b Mon Sep 17 00:00:00 2001 From: xiaying Date: Sat, 15 Jun 2024 15:39:59 +0800 Subject: [PATCH] MNN:Sync: Fix bug for llama2/llama3 attention fuse, refract llm usage --- CMakeLists.txt | 2 + docs/compile/engine.md | 6 +- docs/compile/{tools.md => other.md} | 24 +- docs/index.rst | 10 +- docs/transformers/diffusion.md | 3 + docs/transformers/llm.md | 198 +++ express/Executor.cpp | 32 +- express/Expr.cpp | 5 +- express/RuntimeAttr.hpp | 1 - express/module/Module.cpp | 2 +- include/MNN/Interpreter.hpp | 21 + pymnn/src/util.h | 48 +- .../arm82/asm/arm64/MNNPackedMatMulFP16.S | 2 +- .../low_memory/MNNPackedMatMulFP16_int4.S | 352 ++-- .../low_memory/MNNPackedMatMulFP16_int8.S | 361 ++--- .../MNNPackedMatMulRemainFP16_int4.S | 183 ++- .../MNNPackedMatMulRemainFP16_int8.S | 178 +- source/backend/cpu/CMakeLists.txt | 2 - source/backend/cpu/CPUAttention.cpp | 204 ++- source/backend/cpu/CPUAttention.hpp | 43 +- source/backend/cpu/CPURaster.cpp | 8 +- .../low_memory/MNNPackedMatMulRemain_int4.S | 98 +- .../low_memory/MNNPackedMatMulRemain_int8.S | 69 +- .../arm64/low_memory/MNNPackedMatMul_int4.S | 60 +- .../arm64/low_memory/MNNPackedMatMul_int8.S | 53 +- .../backend/cpu/compute/CommonOptFunction.cpp | 42 +- .../backend/cpu/compute/ConvolutionHybrid.cpp | 30 +- .../compute/DenseConvolutionTiledExecutor.cpp | 177 +- .../cpu/compute/StrassenMatmulComputor.cpp | 3 +- source/backend/cpu/x86_x64/avx/GemmAVX2.cpp | 16 +- .../backend/cpu/x86_x64/avx/GemmFunction.hpp | 928 +++++++++-- source/backend/cpu/x86_x64/sse/GemmCommon.hpp | 21 + .../backend/cpu/x86_x64/sse/GemmFunction.hpp | 135 +- source/backend/cpu/x86_x64/sse/GemmSSE.cpp | 16 +- source/backend/metal/AllShader.cpp | 30 +- source/backend/metal/CMakeLists.txt | 18 - source/backend/metal/MetalAttention.mm | 56 +- source/backend/metal/MetalConvolution1x1.mm | 7 +- .../backend/metal/MetalConvolutionCommon.mm | 57 +- .../metal/shader/MetalConvolution1x1.metal | 99 +- .../opencl/execution/cl/opencl_program.cc | 2 +- source/core/ConvolutionCommon.cpp | 1 + source/core/IDSTEncoder.hpp | 50 +- source/core/Interpreter.cpp | 31 +- source/core/Pipeline.cpp | 4 +- source/core/Pipeline.hpp | 2 +- source/core/Session.cpp | 40 +- source/core/Session.hpp | 4 + source/core/TensorUtils.cpp | 489 +++--- source/core/TensorUtils.hpp | 12 +- source/geometry/GeometryBinary.cpp | 167 +- source/geometry/GeometryComputer.cpp | 84 +- source/geometry/GeometryComputer.hpp | 7 +- source/geometry/GeometryComputerUtils.cpp | 10 +- test.sh | 10 +- test/CommonOpCreator.hpp | 4 +- test/MNNTestSuite.cpp | 34 +- test/core/RegionFuse.cpp | 48 +- test/expr/MatMulTest.cpp | 53 + test/op/ConvInt8Test.cpp | 2 +- test/op/ConvolutionTest.cpp | 113 +- test/op/ZerosLikeTest.cpp | 11 + tools/converter/include/config.hpp | 1 + .../source/common/WeightQuantAndCoding.cpp | 56 +- tools/converter/source/common/cli.cpp | 10 +- .../source/common/convertToStaticModel.cpp | 2 +- .../source/optimizer/merge/FuseAttention.cpp | 29 +- .../source/optimizer/merge/MergeHelpers.cpp | 5 + .../source/optimizer/merge/MergeHelpers.hpp | 1 + .../postconvert/RemoveInvalidCast.cpp | 55 +- tools/cpp/ExprDebug.hpp | 8 +- tools/cpp/ModuleBasic.cpp | 7 + tools/script/apply_gptq.py | 187 +++ transformers/llm/config.json | 9 + transformers/llm/engine/include/llm.hpp | 533 +++--- transformers/llm/engine/include/tokenizer.hpp | 36 +- transformers/llm/engine/llm_demo.cpp | 30 +- transformers/llm/engine/src/llm.cpp | 1092 +++++-------- transformers/llm/engine/src/tokenizer.cpp | 177 +- 
transformers/llm/export/llm_export.py | 289 +++- .../Baichuan2-7B-Chat/modeling_baichuan.py | 77 +- .../Llama-2-7b-chat-ms/modeling_llama.py | 42 +- .../Llama-3-8B-Instruct/modeling_llama.py | 42 +- .../llm_models/MiniCPM-1.2b/config.json | 0 .../llm_models/MiniCPM-2.4b/config.json | 0 .../llm_models/Qwen-7B-Chat/modeling_qwen.py | 9 +- .../llm_models/Qwen-VL-Chat/modeling_qwen.py | 14 +- .../Qwen1_5-0_5B-Chat/modeling_qwen2.py | 6 +- .../Qwen1_5-1_8B-Chat/modeling_qwen2.py | 6 +- .../Qwen1_5-4B-Chat/modeling_qwen2.py | 6 +- .../Qwen1_5-7B-Chat/modeling_qwen2.py | 6 +- .../Qwen2-0_5B-Instruct/config.json | 31 + .../configuration_qwen2.py | 144 ++ .../Qwen2-0_5B-Instruct/modeling_qwen2.py | 1436 +++++++++++++++++ .../Qwen2-1_5B-Instruct/config.json | 31 + .../configuration_qwen2.py | 144 ++ .../Qwen2-1_5B-Instruct/modeling_qwen2.py | 1436 +++++++++++++++++ .../export/llm_models/Qwen2-1_5B/config.json | 31 + .../Qwen2-1_5B/configuration_qwen2.py | 144 ++ .../llm_models/Qwen2-1_5B/modeling_qwen2.py | 1434 ++++++++++++++++ .../llm_models/Qwen2-7B-Instruct/config.json | 31 + .../Qwen2-7B-Instruct/configuration_qwen2.py | 144 ++ .../Qwen2-7B-Instruct/modeling_qwen2.py | 1436 +++++++++++++++++ .../config.json | 0 .../configuration_llama.py | 0 .../modeling_llama.py | 42 +- .../llm_models/Yi-6B-Chat/modeling_llama.py | 42 +- .../deepseek-llm-7b-chat/modeling_llama.py | 42 +- .../glm-4-9b-chat/modeling_chatglm.py | 1238 ++++++++++++++ .../internlm-chat-7b/modeling_internlm.py | 39 +- 110 files changed, 12586 insertions(+), 2772 deletions(-) rename docs/compile/{tools.md => other.md} (87%) create mode 100644 docs/transformers/diffusion.md create mode 100644 docs/transformers/llm.md create mode 100644 tools/script/apply_gptq.py create mode 100755 transformers/llm/config.json mode change 100755 => 100644 transformers/llm/export/llm_models/MiniCPM-1.2b/config.json mode change 100755 => 100644 transformers/llm/export/llm_models/MiniCPM-2.4b/config.json create mode 100755 transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/config.json create mode 100644 transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/configuration_qwen2.py create mode 100644 transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/modeling_qwen2.py create mode 100755 transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/config.json create mode 100644 transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/configuration_qwen2.py create mode 100644 transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/modeling_qwen2.py create mode 100755 transformers/llm/export/llm_models/Qwen2-1_5B/config.json create mode 100644 transformers/llm/export/llm_models/Qwen2-1_5B/configuration_qwen2.py create mode 100644 transformers/llm/export/llm_models/Qwen2-1_5B/modeling_qwen2.py create mode 100755 transformers/llm/export/llm_models/Qwen2-7B-Instruct/config.json create mode 100644 transformers/llm/export/llm_models/Qwen2-7B-Instruct/configuration_qwen2.py create mode 100644 transformers/llm/export/llm_models/Qwen2-7B-Instruct/modeling_qwen2.py rename transformers/llm/export/llm_models/{TinyLlama-1.1B-Chat => TinyLlama-1_1B-Chat}/config.json (100%) rename transformers/llm/export/llm_models/{TinyLlama-1.1B-Chat => TinyLlama-1_1B-Chat}/configuration_llama.py (100%) rename transformers/llm/export/llm_models/{TinyLlama-1.1B-Chat => TinyLlama-1_1B-Chat}/modeling_llama.py (95%) create mode 100755 transformers/llm/export/llm_models/glm-4-9b-chat/modeling_chatglm.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 7e012f380..bd2220bc7 100644 --- 
a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,6 +61,8 @@ option(MNN_BUILD_LLM "Build llm library based MNN." OFF) option(MNN_BUILD_DIFFUSION "Build diffusion demo based MNN." OFF) option(MNN_INTERNAL "Build with MNN internal features, such as model authentication, metrics logging" OFF) option(MNN_JNI "Build MNN Jni for java to use" OFF) +option(MNN_SUPPORT_BF16 "Enable MNN's bf16 op" OFF) +option(MNN_LOW_MEMORY "Build MNN support low memory for weight quant model." OFF) IF (OHOS) include($ENV{NODE_PATH}/@ali/tcpkg/tcpkg.cmake) diff --git a/docs/compile/engine.md b/docs/compile/engine.md index 763202078..eb8eb6503 100644 --- a/docs/compile/engine.md +++ b/docs/compile/engine.md @@ -3,14 +3,14 @@ ## Linux/MacOS - 环境要求 - cmake >= 3.10 - - gcc >= 4.9 + - gcc >= 4.9 或者使用 clang - 相关编译选项 - - `MNN_ONEDNN` 是否使用oneDNN库来加速卷积运算 - `MNN_AVX512` 是否使用AVX512指令,需要gcc9以上版本编译 - `MNN_OPENCL` 是否使用OpenCL后端,针对GPU设备 + - `MNN_METAL` 是否使用Metal后端,针对MacOS/iOSGPU设备 - `MNN_VULKAN` 是否使用Vulkan后端,针对GPU设备 - `MNN_CUDA` 是否使用CUDA后端,针对Nivida GPU设备 - - `MNN_TENSORRT` 是否使用TensorRT后端,针对Nivida GPU设备 + - 其他编译选项可自行查看 CMakeLists.txt - 具体步骤 1. 准备工作 (可选,修改 MNN Schema 后需要) ```bash diff --git a/docs/compile/tools.md b/docs/compile/other.md similarity index 87% rename from docs/compile/tools.md rename to docs/compile/other.md index f119c397d..d0209f61b 100644 --- a/docs/compile/tools.md +++ b/docs/compile/other.md @@ -1,4 +1,4 @@ -# 工具模块编译 +# 其他模块编译 ## 模型转换工具 - 相关编译选项 @@ -31,6 +31,28 @@ - `runTrainDemo.out` 运行训练框架demo的入口程序 - `transformer` 训练模型转换器,将推理用的MNN模型转换为执行训练的MNN模型 - `extractForInfer` 从执行训练的MNN模型中提取参数,对应更新推理用的MNN模型 +## 生成式模型 +- 相关编译选项 + - `MNN_BUILD_DIFFUSION` 是否编译扩散模型推理示例 + - `MNN_BUILD_LLM` 是否编译大语言模型推理引擎 + - `MNN_SUPPORT_TRANSFORMER_FUSE` 是否支持`transformer`相关的融合算子,主要加速transformer模型 +- 编译命令 + - 编译扩散模型推理示例 + ```bash + mkdir build && cd build + cmake .. -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_BUILD_DIFFUSION=ON -DMNN_SUPPORT_TRANSFORMER_FUSE=ON + make -j4 + ``` + - 编译大语言模型推理引擎 + ```bash + mkdir build && cd build + cmake .. -DMNN_BUILD_LLM=ON -DMNN_SUPPORT_TRANSFORMER_FUSE=ON + make -j4 + ``` +- 编译产物 + - `libllm.so` 大语言模型推理库 + - `llm_demo` 大语言模型推理示例程序 + - `diffusion_demo` 扩散模型示例程序 ## 测试工具 - 相关编译选项 - `MNN_BUILD_TOOL` 是否编译测试工具 diff --git a/docs/index.rst b/docs/index.rst index ac2730945..8c97f2410 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -31,7 +31,7 @@ compile/cmake compile/engine - compile/tools + compile/other compile/pymnn .. toctree:: @@ -62,6 +62,14 @@ train/finetune train/distl +.. toctree:: + :maxdepth: 1 + :caption: 生成式模型 + :name: transformers + + transformers/diffusion + transformers/llm + .. toctree:: :maxdepth: 1 :caption: 测试工具 diff --git a/docs/transformers/diffusion.md b/docs/transformers/diffusion.md new file mode 100644 index 000000000..da22cb304 --- /dev/null +++ b/docs/transformers/diffusion.md @@ -0,0 +1,3 @@ +# 扩散模型 + +TODO \ No newline at end of file diff --git a/docs/transformers/llm.md b/docs/transformers/llm.md new file mode 100644 index 000000000..ea671993b --- /dev/null +++ b/docs/transformers/llm.md @@ -0,0 +1,198 @@ +# 大语言模型 + +基于MNN开发的LLM推理引擎,支持目前主流的开源LLM模型。该功能分为2部分: +- 模型导出:将torch模型导出为onnx,然后转换为mnn模型;导出tokenizer文件,embedding等文件; +- 模型推理:支持导出的模型推理,支持LLM模型的文本生成; + +## 模型导出 + +`llm_export`是一个llm模型导出工具,能够将llm模型导出为onnx和mnn模型。 + +### 用法 +1. 将需要导出的LLM项目clone到本地,如:Qwen2-0.5B-Instruct +```sh +git clone https://www.modelscope.cn/qwen/Qwen2-0.5B-Instruct.git +``` +3. 
执行`llm_export.py`导出模型 +```sh +cd ./transformers/llm/export +# 导出模型,tokenizer和embedding,并导出对应的mnn模型 +python llm_export.py \ + --type Qwen2-0_5B-Instruct \ + --path /path/to/Qwen2-0.5B-Instruct \ + --export \ + --export_token \ + --export_embed --embed_bin \ + --export_mnn +``` +4. 导出产物 +导出产物为: +1. `embeddings_bf16.bin`: 模型的embedding权重二进制文件,推理时使用; +2. `llm_config.json`: 模型的配置信息,推理时使用; +3. `llm.onnx`: 模型的onnx文件,推理时不使用; +4. `tokenizer.txt`: 模型的tokenzier文件,推理时使用; +5. `llm.mnn`: 模型的mnn文件,推理时使用; +6. `llm.mnn.weight`: 模型的mnn权重,推理时使用; +目录结构如下所示: +``` +. +├── onnx +| ├── embeddings_bf16.bin +| ├── llm_config.json +| ├── llm.onnx +| └── tokenizer.txt +└── mnn + ├── llm.mnn + └── llm.mnn.weight +``` + +### 功能 +- 支持将模型完整导出为一个onnx模型,使用`--export` +- 支持将模型分段导出为多个模型,使用`--export_split` +- 支持导出模型的词表到一个文本文件,每行代表一个token;其中token使用base64编码;使用`--export_verbose` +- 支持导出模型的Embedding层为一个onnx模型,使用`--export_embed`,同时支持bf16格式,使用`--embed_bf16` +- 支持分层导出模型的block,使用`--export_blocks`导出全部层;使用`--export_block $id`导出指定层 +- 支持导出模型的lm_head层为一个onnx模型,使用`--export_lm` +- 支持导出多模态模型的visual模型为一个onnx模型,使用`--export_visual` +- 支持对模型进行对话测试,使用`--test $query`会返回llm的回复内容 +- 支持在导出onnx模型后使用onnxruntime对结果一致性进行校验,使用`--export_test` +- 支持将tokenizer导出为文本文件,使用`--export_token` +- 支持将导出的onnx模型转换为mnn模型,默认转换为非对称4bit量化,使用`--export_mnn` +- 指定导出路径使用`--onnx_path`和`--mnn_path` +- 默认会使用onnx-slim对onnx模型进行优化,跳过该步骤使用`--skip_slim` +- 支持合并lora权重后导出,指定lora权重的目录使用`--lora_path` + +### 参数 +``` +usage: llm_export.py [-h] --path PATH + [--type {chatglm-6b,chatglm2-6b,chatglm3-6b,codegeex2-6b,Qwen-7B-Chat,Qwen-1_8B-Chat,Qwen-1_8B,Qwen-VL-Chat,Qwen1_5-0_5B-Chat,Qwen1_5-1_8B-Chat,Qwen1_5-4B-Chat,Qwen1_5-7B-Chat,Qwen2-1_5B-Instruct,Baichuan2-7B-Chat,Llama-2-7b-chat-ms,Llama-3-8B-Instruct,internlm-chat-7b,TinyLlama-1_1B-Chat,Yi-6B-Chat,deepseek-llm-7b-chat,phi-2,bge-large-zh,lora}] + [--lora_path LORA_PATH] [--onnx_path ONNX_PATH] [--mnn_path MNN_PATH] [--export_mnn] [--export_verbose] [--export_test] [--test TEST] [--export] [--export_split] [--export_token] + [--export_embed] [--export_visual] [--export_lm] [--export_block EXPORT_BLOCK] [--export_blocks] [--embed_bin] [--embed_bf16] [--skip_slim] + +llm_exporter + +options: + -h, --help show this help message and exit + --path PATH path(`str` or `os.PathLike`): + Can be either: + - A string, the *model id* of a pretrained model like `THUDM/chatglm-6b`. [TODO] + - A path to a *directory* clone from repo like `../chatglm-6b`. + --type {chatglm-6b,chatglm2-6b,chatglm3-6b,codegeex2-6b,Qwen-7B-Chat,Qwen-1_8B-Chat,Qwen-1_8B,Qwen-VL-Chat,Qwen1_5-0_5B-Chat,Qwen1_5-1_8B-Chat,Qwen1_5-4B-Chat,Qwen1_5-7B-Chat,Qwen2-1_5B-Instruct,Baichuan2-7B-Chat,Llama-2-7b-chat-ms,Llama-3-8B-Instruct,internlm-chat-7b,TinyLlama-1_1B-Chat,Yi-6B-Chat,deepseek-llm-7b-chat,phi-2,bge-large-zh,lora} + type(`str`, *optional*): + The pretrain llm model type. + --lora_path LORA_PATH + lora path, defaut is `None` mean not apply lora. + --onnx_path ONNX_PATH + export onnx model path, defaut is `./onnx`. + --mnn_path MNN_PATH export mnn model path, defaut is `./mnn`. + --export_mnn Whether or not to export mnn model after onnx. + --export_verbose Whether or not to export onnx with verbose. + --export_test Whether or not to export onnx with test using onnxruntime. + --test TEST test model inference with query `TEST`. + --export export model to an `onnx` model. + --export_split export model split to some `onnx` models: + - embedding model. + - block models. + - lm_head model. + --export_token export llm tokenizer to a txt file. 
+ --export_embed export llm embedding to an `onnx` model. + --export_visual export llm visual model to an `onnx` model. + --export_lm export llm lm_head to an `onnx` model. + --export_block EXPORT_BLOCK + export llm block [id] to an `onnx` model. + --export_blocks export llm all blocks to `onnx` models. + --embed_bin export embedding weight as bin file with dtype `bfloat16` + --embed_bf16 using `bfloat16` replace `float32` in embedding. + --skip_slim Whether or not to skip onnx-slim. +``` + +## 模型推理 + +### 编译 + +[从源码编译](../compile/tools.html#id4) + +### 使用 +#### 运行时配置 + +##### 运行时文件 +将导出产物中用于模型推理的部分置于同一个文件夹下,添加一个配置文件`config.json`来描述模型名称与推理参数,目录如下: +``` +. +└── model_dir + ├── config.json + ├── embeddings_bf16.bin + ├── llm_config.json + ├── llm.mnn + ├── llm.mnn.weight + └── tokenizer.txt +``` + +##### 配置项 +配置文件支持以下配置: +- 模型文件信息 + - base_dir: 模型文件加载的文件夹目录,默认为config.json的所在目录,或模型所在目录; + - llm_config: `llm_config.json`的实际名称路径为`base_dir + llm_config`,默认为`base_dir + 'config.json'` + - llm_model: `llm.mnn`的实际名称路径为`base_dir + llm_model`,默认为`base_dir + 'llm.mnn'` + - llm_weight: `llm.mnn.weight`的实际名称路径为`base_dir + llm_weight`,默认为`base_dir + 'llm.mnn.weight'` + - block_model: 分段模型时`block_{idx}.mnn`的实际路径为`base_dir + block_model`,默认为`base_dir + 'block_{idx}.mnn'` + - lm_model: 分段模型时`lm.mnn`的实际路径为`base_dir + lm_model`,默认为`base_dir + 'lm.mnn'` + - embedding_model: 当embedding使用模型时,embedding的实际路径为`base_dir + embedding_model`,默认为`base_dir + 'embedding.mnn'` + - embedding_file: 当embedding使用二进制时,embedding的实际路径为`base_dir + embedding_file`,默认为`base_dir + 'embeddings_bf16.bin'` + - tokenizer_file: `tokenizer.txt`的实际名称路径为`base_dir + tokenizer_file`,默认为`base_dir + 'tokenizer.txt'` + - visual_model: 当使用VL模型时,visual_model的实际路径为`base_dir + visual_model`,默认为`base_dir + 'visual.mnn'` +- 推理配置 + - max_new_tokens: 生成时最大token数,默认为`512` +- 硬件配置 + - backend_type: 推理使用硬件后端类型,默认为:`"cpu"` + - thread_num: 推理使用硬件线程数,默认为:`4` + - precision: 推理使用精度策略,默认为:`"low"`,尽量使用`fp16` + - memory: 推理使用内存策略,默认为:`"low"`,开启运行时量化 + +##### 配置文件示例 +- `config.json` + ```json + { + "llm_model": "qwen2-1.5b-int4.mnn", + "llm_weight": "qwen2-1.5b-int4.mnn.weight", + + "backend_type": "cpu", + "thread_num": 4, + "precision": "low", + "memory": "low" + } + ``` +- `llm_config.json` + ```json + { + "hidden_size": 1536, + "layer_nums": 28, + "attention_mask": "float", + "key_value_shape": [ + 2, + 1, + 0, + 2, + 128 + ], + "prompt_template": "<|im_start|>user\n%s<|im_end|>\n<|im_start|>assistant\n", + "is_visual": false, + "is_single": true + } + ``` + +#### 推理用法 +`llm_demo`的用法如下: +``` +# 使用config.json +## 交互式聊天 +./llm_demo model_dir/config.json +## 针对prompt中的每行进行回复 +./llm_demo model_dir/config.json prompt.txt + +# 不使用config.json, 使用默认配置 +## 交互式聊天 +./llm_demo model_dir/llm.mnn +## 针对prompt中的每行进行回复 +./llm_demo model_dir/llm.mnn prompt.txt +``` \ No newline at end of file diff --git a/express/Executor.cpp b/express/Executor.cpp index 93f0fd486..0edb9d6ad 100644 --- a/express/Executor.cpp +++ b/express/Executor.cpp @@ -243,38 +243,10 @@ void Executor::RuntimeManager::destroy(RuntimeManager* rtmgr) { } void Executor::RuntimeManager::setMode(Interpreter::SessionMode mode) { - if (mode == Interpreter::Session_Input_Inside || mode == Interpreter::Session_Input_User) { - mInside->modes.inputMode = mode; - } else if (mode == Interpreter::Session_Output_User || mode == Interpreter::Session_Output_Inside) { - mInside->modes.outputMode = mode; - } else if (mode == Interpreter::Session_Backend_Auto || mode == Interpreter::Session_Backend_Fix) { - 
mInside->modes.backendMode = mode; - } else if (mode == Interpreter::Session_Debug || mode == Interpreter::Session_Release) { - mInside->modes.callBackMode = mode; - } else if (mode == Interpreter::Session_Resize_Direct || mode == Interpreter::Session_Resize_Defer) { - mInside->modes.resizeMode = mode; - } else if(mode == Interpreter::Session_Memory_Collect || mode == Interpreter::Session_Memory_Cache) { - mInside->modes.memoryUsageMode = mode; - } else if(mode == Interpreter::Session_Codegen_Disable || mode == Interpreter::Session_Codegen_Enable) { - mInside->modes.codegenMode = mode; - } + mInside->modes.setMode(mode); } void Executor::RuntimeManager::setHint(Interpreter::HintMode mode, int value) { - switch (mode) { - case Interpreter::MAX_TUNING_NUMBER: - mInside->modes.maxTuningNumber = value; - break; - case Interpreter::STRICT_CHECK_MODEL: - mInside->checkNetBuffer = value > 0; - break; - case Interpreter::MEM_ALLOCATOR_TYPE: - mInside->modes.memoryAllocatorType = value; - break; - case Interpreter::WINOGRAD_MEMORY_LEVEL: - mInside->modes.winogradMemoryUsed = value; - default: - break; - } + mInside->modes.setHint(mode, value); } bool Executor::RuntimeManager::getInfo(Interpreter::SessionInfoCode code, void* ptr) { // Only support get memory diff --git a/express/Expr.cpp b/express/Expr.cpp index 4ff29a59c..aa664ad24 100644 --- a/express/Expr.cpp +++ b/express/Expr.cpp @@ -372,7 +372,7 @@ VARP Variable::create(EXPRP expr, int index) { res.fix(VARP::CONSTANT); return res; } - // CONTENT Mode + // CONTENT Mode, Use Geometry Computer to Decompress Expr do { if (!(executor->getLazyMode() & Executor::LAZY_CONTENT)) { break; @@ -398,7 +398,8 @@ VARP Variable::create(EXPRP expr, int index) { outputTensors[i] = expr->mInside->mOutputTensors[i]; } auto bn = executor->getAttr()->constantBackend; - GeometryComputer::Context context(bn); + // TODO: Support set mask + GeometryComputer::Context context(Interpreter::GeometryComputeMask::GEOMETRCOMPUTEMASK_ALL, bn); auto geo = GeometryComputer::search(expr->get()->type(), Runtime::Compiler_Loop); CommandBuffer cmd; res = geo->onCompute(expr->get(), inputTensors, outputTensors, context, cmd); diff --git a/express/RuntimeAttr.hpp b/express/RuntimeAttr.hpp index 0aef32824..3272cde95 100644 --- a/express/RuntimeAttr.hpp +++ b/express/RuntimeAttr.hpp @@ -21,7 +21,6 @@ struct RuntimeAttr { // Use for static module to compute flops float mFlops; std::string mExternalFile; - bool checkNetBuffer = true; }; struct ExecutorAttr { std::shared_ptr constantBackend; diff --git a/express/module/Module.cpp b/express/module/Module.cpp index d56c944cd..00b0a63bc 100644 --- a/express/module/Module.cpp +++ b/express/module/Module.cpp @@ -351,7 +351,7 @@ static Module* loadInternal(const std::vector& inputs, const std::v } bool checkMNNBuffer = true; if (nullptr != _rtMgr) { - checkMNNBuffer = _rtMgr->getInside()->checkNetBuffer; + checkMNNBuffer = _rtMgr->getInside()->modes.checkNetBuffer; } if (checkMNNBuffer) { flatbuffers::Verifier verify(buffer, length); diff --git a/include/MNN/Interpreter.hpp b/include/MNN/Interpreter.hpp index 5a6e235fb..16344a52b 100644 --- a/include/MNN/Interpreter.hpp +++ b/include/MNN/Interpreter.hpp @@ -203,7 +203,28 @@ class MNN_PUBLIC Interpreter { MEM_ALLOCATOR_TYPE = 2, // Winograd unit candidates count, default 3. if set 0, will use less unit candidates for less memory at the expense of performance. 
WINOGRAD_MEMORY_LEVEL = 3, + + // Geometry Compute option, default is 0xFFFF + GEOMETRY_COMPUTE_MASK = 4, + }; + + enum GeometryComputeMask { + // Support Region Fuse + GEOMETRCOMPUTEMASK_FUSEREGION = 1 << 0, + + // Support Region Fuse to input with multi-region, eg: pad + concat + GEOMETRCOMPUTEMASK_FUSEREGION_MULTI = 1 << 1, + + // Use loop instead of raster + compute if possible + GEOMETRCOMPUTEMASK_USELOOP = 1 << 2, + + // Support Geometry Cache, if shape changed, will try recompute, and then run compute if failed + GEOMETRCOMPUTEMASK_OPENCACHE = 1 << 3, + + // Full option open mask, for example, if want to close useloop, can set mask as (GEOMETRCOMPUTEMASK_ALL - GEOMETRCOMPUTEMASK_USELOOP) + GEOMETRCOMPUTEMASK_ALL = 0xFFFF, }; + /** * @brief The API shoud be called before create session. * @param mode Hint type diff --git a/pymnn/src/util.h b/pymnn/src/util.h index f3c855578..bd33cc895 100644 --- a/pymnn/src/util.h +++ b/pymnn/src/util.h @@ -667,23 +667,45 @@ inline bool getScheduleConfig(PyObject* dict, MNN::ScheduleConfig &config) { } config.numThread = (int)toInt(numThread); } - { - //precision - PyObject *obj = PyDict_GetItemString(dict, "precision"); + //power + PyObject *obj = PyDict_GetItemString(dict, "power"); if (obj) { - auto obj_name = object2String(obj); - if (!obj_name.compare("low")) { - MNN_PRINT("MNN use low precision\n"); - backendConfig->precision = MNN::BackendConfig::Precision_Low; + if (isInt(obj)) { + backendConfig->power = (MNN::BackendConfig::PowerMode)toInt(obj); } - if (!obj_name.compare("Low_BF16")) { - MNN_PRINT("MNN use lowBF precision\n"); - backendConfig->precision = MNN::BackendConfig::Precision_Low_BF16; + } + } + { + //memory + PyObject *obj = PyDict_GetItemString(dict, "memory"); + if (obj) { + if (isInt(obj)) { + backendConfig->memory = (MNN::BackendConfig::MemoryMode)toInt(obj); } - if (!obj_name.compare("high")) { - MNN_PRINT("MNN use high precision\n"); - backendConfig->precision = MNN::BackendConfig::Precision_High; + } + } + { + //precision + PyObject *obj = PyDict_GetItemString(dict, "precision"); + if (obj) { + if (isInt(obj)) { + backendConfig->precision = (MNN::BackendConfig::PrecisionMode)toInt(obj); + } else { + // For compability + auto obj_name = object2String(obj); + if (!obj_name.compare("low")) { + MNN_PRINT("MNN use low precision\n"); + backendConfig->precision = MNN::BackendConfig::Precision_Low; + } + if (!obj_name.compare("Low_BF16")) { + MNN_PRINT("MNN use lowBF precision\n"); + backendConfig->precision = MNN::BackendConfig::Precision_Low_BF16; + } + if (!obj_name.compare("high")) { + MNN_PRINT("MNN use high precision\n"); + backendConfig->precision = MNN::BackendConfig::Precision_High; + } } } } diff --git a/source/backend/arm82/asm/arm64/MNNPackedMatMulFP16.S b/source/backend/arm82/asm/arm64/MNNPackedMatMulFP16.S index 7d1caf9f4..f1a462b93 100644 --- a/source/backend/arm82/asm/arm64/MNNPackedMatMulFP16.S +++ b/source/backend/arm82/asm/arm64/MNNPackedMatMulFP16.S @@ -14,7 +14,7 @@ // 8 * 24 MatMul asm_function MNNPackedMatMulFP16 //void MNNPackedMatMulFP16(FLOAT16* C, const FLOAT16* A, const FLOAT16* B, const size_t* parameter, const FLOAT16* postParameters, const FLOAT16* bias); -// x0: C, x1:A, x2:B, x3:parameter, x5: postParameters, x6:bias +// x0: C, x1:A, x2:B, x3:parameter, x4: postParameters, x5:bias stp d14, d15, [sp, #-64]! 
stp d12, d13, [sp, #16] stp d10, d11, [sp, #32] diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int4.S b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int4.S index 72f7e12bd..118b4f104 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int4.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int4.S @@ -11,25 +11,106 @@ .text .align 5 + +.macro FMLA_8 d0, d1, d2, d3, d4, d5, d6, d7, s0, s1 + fmla \d0\().8h, \s0\().8h, \s1\().h[0] + fmla \d1\().8h, \s0\().8h, \s1\().h[1] + fmla \d2\().8h, \s0\().8h, \s1\().h[2] + fmla \d3\().8h, \s0\().8h, \s1\().h[3] + fmla \d4\().8h, \s0\().8h, \s1\().h[4] + fmla \d5\().8h, \s0\().8h, \s1\().h[5] + fmla \d6\().8h, \s0\().8h, \s1\().h[6] + fmla \d7\().8h, \s0\().8h, \s1\().h[7] +.endm + +.macro FMLA_4 d0, d1, d2, d3, s0, s1 + fmla \d0\().8h, \s0\().8h, \s1\().h[0] + fmla \d1\().8h, \s0\().8h, \s1\().h[1] + fmla \d2\().8h, \s0\().8h, \s1\().h[2] + fmla \d3\().8h, \s0\().8h, \s1\().h[3] +.endm + +.macro FMUL_8 d0, d1, d2, d3, d4, d5, d6, d7, s0, s1 + fmul \d0\().8h, \s0\().8h, \s1\().h[0] + fmul \d1\().8h, \s0\().8h, \s1\().h[1] + fmul \d2\().8h, \s0\().8h, \s1\().h[2] + fmul \d3\().8h, \s0\().8h, \s1\().h[3] + fmul \d4\().8h, \s0\().8h, \s1\().h[4] + fmul \d5\().8h, \s0\().8h, \s1\().h[5] + fmul \d6\().8h, \s0\().8h, \s1\().h[6] + fmul \d7\().8h, \s0\().8h, \s1\().h[7] +.endm + +.macro FMUL_4 d0, d1, d2, d3, s0, s1 + fmul \d0\().8h, \s0\().8h, \s1\().h[0] + fmul \d1\().8h, \s0\().8h, \s1\().h[1] + fmul \d2\().8h, \s0\().8h, \s1\().h[2] + fmul \d3\().8h, \s0\().8h, \s1\().h[3] +.endm + +.macro FADD_12 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, z0 + fadd \d0\().8h, \d0\().8h, \z0\().8h + fadd \d1\().8h, \d1\().8h, \z0\().8h + fadd \d2\().8h, \d2\().8h, \z0\().8h + fadd \d3\().8h, \d3\().8h, \z0\().8h + fadd \d4\().8h, \d4\().8h, \z0\().8h + fadd \d5\().8h, \d5\().8h, \z0\().8h + fadd \d6\().8h, \d6\().8h, \z0\().8h + fadd \d7\().8h, \d7\().8h, \z0\().8h + fadd \d8\().8h, \d8\().8h, \z0\().8h + fadd \d9\().8h, \d9\().8h, \z0\().8h + fadd \d10\().8h, \d10\().8h, \z0\().8h + fadd \d11\().8h, \d11\().8h, \z0\().8h +.endm + +.macro FMAX_12 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, z0 + fmax \d0\().8h, \d0\().8h, \z0\().8h + fmax \d1\().8h, \d1\().8h, \z0\().8h + fmax \d2\().8h, \d2\().8h, \z0\().8h + fmax \d3\().8h, \d3\().8h, \z0\().8h + fmax \d4\().8h, \d4\().8h, \z0\().8h + fmax \d5\().8h, \d5\().8h, \z0\().8h + fmax \d6\().8h, \d6\().8h, \z0\().8h + fmax \d7\().8h, \d7\().8h, \z0\().8h + fmax \d8\().8h, \d8\().8h, \z0\().8h + fmax \d9\().8h, \d9\().8h, \z0\().8h + fmax \d10\().8h, \d10\().8h, \z0\().8h + fmax \d11\().8h, \d11\().8h, \z0\().8h +.endm + +.macro FMIN_12 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, z0 + fmin \d0\().8h, \d0\().8h, \z0\().8h + fmin \d1\().8h, \d1\().8h, \z0\().8h + fmin \d2\().8h, \d2\().8h, \z0\().8h + fmin \d3\().8h, \d3\().8h, \z0\().8h + fmin \d4\().8h, \d4\().8h, \z0\().8h + fmin \d5\().8h, \d5\().8h, \z0\().8h + fmin \d6\().8h, \d6\().8h, \z0\().8h + fmin \d7\().8h, \d7\().8h, \z0\().8h + fmin \d8\().8h, \d8\().8h, \z0\().8h + fmin \d9\().8h, \d9\().8h, \z0\().8h + fmin \d10\().8h, \d10\().8h, \z0\().8h + fmin \d11\().8h, \d11\().8h, \z0\().8h +.endm + // 8 * 24 MatMul asm_function MNNPackedMatMulFP16_int4 //void MNNPackedMatMulFP16_int4(FLOAT16* C, const FLOAT16* A, const FLOAT16* B, const size_t* parameter, const FLOAT16* postParameters, const FLOAT16* bias, const FLOAT16* k, const FLOAT16* b); // x0: C, x1:A, x2:B, 
x3:parameter, x4: postParameters, x5:bias, x6: quant_alpha, x7: quant_bias -stp d14, d15, [sp, #-80]! +stp d14, d15, [sp, #-96]! stp d12, d13, [sp, #16] stp d10, d11, [sp, #32] stp d8, d9, [sp, #48] stp x19, x20, [sp, #64] +stp x21, x22, [sp, #80] ldr x9, [x3, #8] // l ldr x10, [x3, #16] // h ldr x13, [x3, #24] // cStride ldr x11, [x3, #40] // bExtraStride - -// v0, v1, v2: A -// v3, v4: B -// v8 - v31: C +ldr x19, [x3, #48] // blockId +mov x20, x0 add x10, x10, #7 lsr x10, x10, #3 @@ -42,6 +123,7 @@ LH8: sub x14, x13, #128 LoopH: mov x15, x1 + mov x22, x2 ld1 {v4.8h, v5.8h}, [x6], #32 // alpha ld1 {v6.8h, v7.8h}, [x7], #32 // bias subs x12, x9, #1 @@ -62,35 +144,34 @@ LoopH: mov v1.8h, v6.8h fmla v1.8h, v0.8h, v4.8h + cbnz x19, LH8_BLOCK_GT_0 + + LH8_BLOCK0: ld1 {v0.8h}, [x15], #16 - fmul v8.8h, v1.8h, v0.h[0] - fmul v9.8h, v1.8h, v0.h[1] - fmul v10.8h, v1.8h, v0.h[2] - fmul v11.8h, v1.8h, v0.h[3] - fmul v12.8h, v1.8h, v0.h[4] - fmul v13.8h, v1.8h, v0.h[5] - fmul v14.8h, v1.8h, v0.h[6] - fmul v15.8h, v1.8h, v0.h[7] - - fmul v20.8h, v2.8h, v0.h[0] - fmul v21.8h, v2.8h, v0.h[1] - fmul v22.8h, v2.8h, v0.h[2] - fmul v23.8h, v2.8h, v0.h[3] - fmul v24.8h, v2.8h, v0.h[4] - fmul v25.8h, v2.8h, v0.h[5] - fmul v26.8h, v2.8h, v0.h[6] - fmul v27.8h, v2.8h, v0.h[7] + FMUL_8 v8, v9, v10, v11, v12, v13, v14, v15, v1, v0 + FMUL_8 v20, v21, v22, v23, v24, v25, v26, v27, v2, v0 + ld1 {v0.4h}, [x15], #8 + FMUL_4 v16, v17, v18, v19, v1, v0 + FMUL_4 v28, v29, v30, v31, v2, v0 + b LH8_INIT_END + + LH8_BLOCK_GT_0: + ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x20], #64 + ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x20], #64 + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x20], x14 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x20], #64 + ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x20], #64 + ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x20], x14 + + ld1 {v0.8h}, [x15], #16 + FMLA_8 v8, v9, v10, v11, v12, v13, v14, v15, v1, v0 + FMLA_8 v20, v21, v22, v23, v24, v25, v26, v27, v2, v0 ld1 {v0.4h}, [x15], #8 - fmul v16.8h, v1.8h, v0.h[0] - fmul v17.8h, v1.8h, v0.h[1] - fmul v18.8h, v1.8h, v0.h[2] - fmul v19.8h, v1.8h, v0.h[3] - fmul v28.8h, v2.8h, v0.h[0] - fmul v29.8h, v2.8h, v0.h[1] - fmul v30.8h, v2.8h, v0.h[2] - fmul v31.8h, v2.8h, v0.h[3] + FMLA_4 v16, v17, v18, v19, v1, v0 + FMLA_4 v28, v29, v30, v31, v2, v0 + LH8_INIT_END: beq LoopLEnd LoopL1: @@ -112,34 +193,11 @@ LoopH: fmla v1.8h, v0.8h, v4.8h ld1 {v0.8h}, [x15], #16 - fmla v8.8h, v1.8h, v0.h[0] - fmla v9.8h, v1.8h, v0.h[1] - fmla v10.8h, v1.8h, v0.h[2] - fmla v11.8h, v1.8h, v0.h[3] - fmla v12.8h, v1.8h, v0.h[4] - fmla v13.8h, v1.8h, v0.h[5] - fmla v14.8h, v1.8h, v0.h[6] - fmla v15.8h, v1.8h, v0.h[7] - - fmla v20.8h, v2.8h, v0.h[0] - fmla v21.8h, v2.8h, v0.h[1] - fmla v22.8h, v2.8h, v0.h[2] - fmla v23.8h, v2.8h, v0.h[3] - fmla v24.8h, v2.8h, v0.h[4] - fmla v25.8h, v2.8h, v0.h[5] - fmla v26.8h, v2.8h, v0.h[6] - fmla v27.8h, v2.8h, v0.h[7] - + FMLA_8 v8, v9, v10, v11, v12, v13, v14, v15, v1, v0 + FMLA_8 v20, v21, v22, v23, v24, v25, v26, v27, v2, v0 ld1 {v0.4h}, [x15], #8 - fmla v16.8h, v1.8h, v0.h[0] - fmla v17.8h, v1.8h, v0.h[1] - fmla v18.8h, v1.8h, v0.h[2] - fmla v19.8h, v1.8h, v0.h[3] - fmla v28.8h, v2.8h, v0.h[0] - fmla v29.8h, v2.8h, v0.h[1] - fmla v30.8h, v2.8h, v0.h[2] - fmla v31.8h, v2.8h, v0.h[3] - + FMLA_4 v16, v17, v18, v19, v1, v0 + FMLA_4 v28, v29, v30, v31, v2, v0 bne LoopL1 LoopLEnd: @@ -148,95 +206,24 @@ LoopH: sub x10, x10, #2 cmp x10, #2 - cbz x4, StoreLH8 + cbz x4, StoreLH8 // If postParameter* is nullptr, not the last blockId, just store the intemediate results. 
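The `blockId` handling introduced above gives this low-memory kernel a simple per-block accumulation contract: for block 0 the C tile is initialized with `fmul`, for later blocks C is reloaded through x20 and accumulated with `fmla`, and the bias/min-max post-treatment only runs when `postParameters` is non-null, i.e. on the final quantization block. A rough C++ sketch of how a caller might drive that contract follows; the wrapper name, the per-block pointer arrays and the `FLOAT16` typedef are illustrative assumptions, not MNN's actual dispatch code — only the kernel symbol and its signature come from this patch.
```cpp
#include <cstddef>

using FLOAT16 = __fp16;  // assumption: FP16 storage type used by the arm82 backend

// Kernel declaration as given in the patch (implemented in assembly above).
extern "C" void MNNPackedMatMulFP16_int4(FLOAT16* C, const FLOAT16* A, const FLOAT16* B,
                                         const size_t* parameter, const FLOAT16* postParameters,
                                         const FLOAT16* bias, const FLOAT16* k, const FLOAT16* b);

// Hypothetical driver: one dequant-GEMM call per weight-quantization block.
void packedMatMulBlockwise(FLOAT16* C, const FLOAT16* A,
                           const FLOAT16* const* Bblock,      // packed int4 weights, per block
                           const FLOAT16* const* alphaBlock,  // dequant scales, per block
                           const FLOAT16* const* qbiasBlock,  // dequant biases, per block
                           size_t* parameter, size_t blockNum,
                           const FLOAT16* postParameters, const FLOAT16* bias) {
    for (size_t blockId = 0; blockId < blockNum; ++blockId) {
        parameter[6] = blockId;  // read in the asm as `ldr x19, [x3, #48]`
        const bool last = (blockId + 1 == blockNum);
        // blockId == 0: kernel initializes C with fmul.
        // blockId  > 0: kernel reloads the C tile (via x20) and accumulates with fmla.
        // postParameters/bias are only supplied on the last block, so the bias add and
        // the min/max clamp are applied once, after all partial block sums are in C.
        MNNPackedMatMulFP16_int4(C, A, Bblock[blockId], parameter,
                                 last ? postParameters : nullptr,
                                 last ? bias : nullptr,
                                 alphaBlock[blockId], qbiasBlock[blockId]);
    }
}
```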
- AddBiasLH8: ld1 {v5.8h}, [x4] fcvtn v5.4h, v5.4s dup v6.8h, v5.h[2] // Min Value dup v7.8h, v5.h[3] // Max Value - ld1 {v0.8h, v1.8h}, [x5], #32 - - fmla v8.8h, v0.8h, v5.h[1] - fmla v9.8h, v0.8h, v5.h[1] - fmla v10.8h, v0.8h, v5.h[1] - fmla v11.8h, v0.8h, v5.h[1] - - fmla v12.8h, v0.8h, v5.h[1] - fmla v13.8h, v0.8h, v5.h[1] - fmla v14.8h, v0.8h, v5.h[1] - fmla v15.8h, v0.8h, v5.h[1] - - fmla v16.8h, v0.8h, v5.h[1] - fmla v17.8h, v0.8h, v5.h[1] - fmla v18.8h, v0.8h, v5.h[1] - fmla v19.8h, v0.8h, v5.h[1] - - fmla v20.8h, v1.8h, v5.h[1] - fmla v21.8h, v1.8h, v5.h[1] - fmla v22.8h, v1.8h, v5.h[1] - fmla v23.8h, v1.8h, v5.h[1] - - fmla v24.8h, v1.8h, v5.h[1] - fmla v25.8h, v1.8h, v5.h[1] - fmla v26.8h, v1.8h, v5.h[1] - fmla v27.8h, v1.8h, v5.h[1] - - fmla v28.8h, v1.8h, v5.h[1] - fmla v29.8h, v1.8h, v5.h[1] - fmla v30.8h, v1.8h, v5.h[1] - fmla v31.8h, v1.8h, v5.h[1] + + AddBiasLH8: + cbz x5, PostTreatLH8 + ld1 {v0.8h, v1.8h}, [x5], #32 // gemm bias + FADD_12 v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v0 + FADD_12 v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v1 PostTreatLH8: - fmax v8.8h, v8.8h, v6.8h - fmax v9.8h, v9.8h, v6.8h - fmax v10.8h, v10.8h, v6.8h - fmax v11.8h, v11.8h, v6.8h - fmax v12.8h, v12.8h, v6.8h - fmax v13.8h, v13.8h, v6.8h - fmax v14.8h, v14.8h, v6.8h - fmax v15.8h, v15.8h, v6.8h - fmax v16.8h, v16.8h, v6.8h - fmax v17.8h, v17.8h, v6.8h - fmax v18.8h, v18.8h, v6.8h - fmax v19.8h, v19.8h, v6.8h - fmax v20.8h, v20.8h, v6.8h - fmax v21.8h, v21.8h, v6.8h - fmax v22.8h, v22.8h, v6.8h - fmax v23.8h, v23.8h, v6.8h - fmax v24.8h, v24.8h, v6.8h - fmax v25.8h, v25.8h, v6.8h - fmax v26.8h, v26.8h, v6.8h - fmax v27.8h, v27.8h, v6.8h - fmax v28.8h, v28.8h, v6.8h - fmax v29.8h, v29.8h, v6.8h - fmax v30.8h, v30.8h, v6.8h - fmax v31.8h, v31.8h, v6.8h - - fmin v8.8h, v8.8h, v7.8h - fmin v9.8h, v9.8h, v7.8h - fmin v10.8h, v10.8h, v7.8h - fmin v11.8h, v11.8h, v7.8h - fmin v12.8h, v12.8h, v7.8h - fmin v13.8h, v13.8h, v7.8h - fmin v14.8h, v14.8h, v7.8h - fmin v15.8h, v15.8h, v7.8h - fmin v16.8h, v16.8h, v7.8h - fmin v17.8h, v17.8h, v7.8h - fmin v18.8h, v18.8h, v7.8h - fmin v19.8h, v19.8h, v7.8h - fmin v20.8h, v20.8h, v7.8h - fmin v21.8h, v21.8h, v7.8h - fmin v22.8h, v22.8h, v7.8h - fmin v23.8h, v23.8h, v7.8h - fmin v24.8h, v24.8h, v7.8h - fmin v25.8h, v25.8h, v7.8h - fmin v26.8h, v26.8h, v7.8h - fmin v27.8h, v27.8h, v7.8h - fmin v28.8h, v28.8h, v7.8h - fmin v29.8h, v29.8h, v7.8h - fmin v30.8h, v30.8h, v7.8h - fmin v31.8h, v31.8h, v7.8h + FMAX_12 v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v6 + FMAX_12 v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v6 + FMIN_12 v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v7 + FMIN_12 v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v7 StoreLH8: @@ -253,6 +240,7 @@ LoopH: LH4: cbz x10, End LoopHRemain: + //mov x22, x2 mov x15, x1 subs x12, x9, #1 ld1 {v20.8h}, [x6], #16 // alpha @@ -270,6 +258,15 @@ LoopHRemain: fmla v3.8h, v6.8h, v20.8h ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 + cbnz x19, LH4_BLOCK_GT_0 + + LH4_BLOCK0: + + FMUL_4 v8, v9, v10, v11, v3, v0 + FMUL_4 v12, v13, v14, v15, v3, v1 + FMUL_4 v16, v17, v18, v19, v3, v2 + b LH4_INIT_END + /* fmul v8.8h, v3.8h, v0.h[0] fmul v9.8h, v3.8h, v0.h[1] fmul v10.8h, v3.8h, v0.h[2] @@ -282,7 +279,17 @@ LoopHRemain: fmul v17.8h, v3.8h, v2.h[1] fmul v18.8h, v3.8h, v2.h[2] fmul v19.8h, v3.8h, v2.h[3] + */ + LH4_BLOCK_GT_0: + ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x20], #64 + ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x20], #64 + ld1 {v16.8h, 
v17.8h, v18.8h, v19.8h}, [x20] + FMLA_4 v8, v9, v10, v11, v3, v0 + FMLA_4 v12, v13, v14, v15, v3, v1 + FMLA_4 v16, v17, v18, v19, v3, v2 + + LH4_INIT_END: beq LoopLREnd LoopLR: @@ -300,86 +307,41 @@ LoopHRemain: fmla v3.8h, v6.8h, v20.8h ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 - fmla v8.8h, v3.8h, v0.h[0] - fmla v9.8h, v3.8h, v0.h[1] - fmla v10.8h, v3.8h, v0.h[2] - fmla v11.8h, v3.8h, v0.h[3] - fmla v12.8h, v3.8h, v1.h[0] - fmla v13.8h, v3.8h, v1.h[1] - fmla v14.8h, v3.8h, v1.h[2] - fmla v15.8h, v3.8h, v1.h[3] - fmla v16.8h, v3.8h, v2.h[0] - fmla v17.8h, v3.8h, v2.h[1] - fmla v18.8h, v3.8h, v2.h[2] - fmla v19.8h, v3.8h, v2.h[3] - + FMLA_4 v8, v9, v10, v11, v3, v0 + FMLA_4 v12, v13, v14, v15, v3, v1 + FMLA_4 v16, v17, v18, v19, v3, v2 bne LoopLR LoopLREnd: cbz x4, StoreLH4 - AddBiasLH4: + ld1 {v5.8h}, [x4] fcvtn v5.4h, v5.4s dup v6.8h, v5.h[2] // Min Value dup v7.8h, v5.h[3] // Max Value + AddBiasLH4: + cbz x5, PostTreatLH4 ld1 {v0.8h}, [x5], #16 - - fmla v8.8h, v0.8h, v5.h[1] - fmla v9.8h, v0.8h, v5.h[1] - fmla v10.8h, v0.8h, v5.h[1] - fmla v11.8h, v0.8h, v5.h[1] - - fmla v12.8h, v0.8h, v5.h[1] - fmla v13.8h, v0.8h, v5.h[1] - fmla v14.8h, v0.8h, v5.h[1] - fmla v15.8h, v0.8h, v5.h[1] - - fmla v16.8h, v0.8h, v5.h[1] - fmla v17.8h, v0.8h, v5.h[1] - fmla v18.8h, v0.8h, v5.h[1] - fmla v19.8h, v0.8h, v5.h[1] + FADD_12 v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v0 PostTreatLH4: - fmax v8.8h, v8.8h, v6.8h - fmax v9.8h, v9.8h, v6.8h - fmax v10.8h, v10.8h, v6.8h - fmax v11.8h, v11.8h, v6.8h - fmax v12.8h, v12.8h, v6.8h - fmax v13.8h, v13.8h, v6.8h - fmax v14.8h, v14.8h, v6.8h - fmax v15.8h, v15.8h, v6.8h - fmax v16.8h, v16.8h, v6.8h - fmax v17.8h, v17.8h, v6.8h - fmax v18.8h, v18.8h, v6.8h - fmax v19.8h, v19.8h, v6.8h - - fmin v8.8h, v8.8h, v7.8h - fmin v9.8h, v9.8h, v7.8h - fmin v10.8h, v10.8h, v7.8h - fmin v11.8h, v11.8h, v7.8h - fmin v12.8h, v12.8h, v7.8h - fmin v13.8h, v13.8h, v7.8h - fmin v14.8h, v14.8h, v7.8h - fmin v15.8h, v15.8h, v7.8h - fmin v16.8h, v16.8h, v7.8h - fmin v17.8h, v17.8h, v7.8h - fmin v18.8h, v18.8h, v7.8h - fmin v19.8h, v19.8h, v7.8h + FMAX_12 v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v6 + FMIN_12 v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v7 StoreLH4: st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #64 st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x0], #64 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0] - sub x10, x10, #1 End: +ldp x21, x22, [sp, #80] ldp x19, x20, [sp, #64] ldp d8, d9, [sp, #48] ldp d10, d11, [sp, #32] ldp d12, d13, [sp, #16] -ldp d14, d15, [sp], #80 +ldp d14, d15, [sp], #96 ret diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int8.S b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int8.S index 0ca03121b..8f92ac238 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int8.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int8.S @@ -11,24 +11,106 @@ .text .align 5 + +.macro FMLA_8 d0, d1, d2, d3, d4, d5, d6, d7, s0, s1 + fmla \d0\().8h, \s0\().8h, \s1\().h[0] + fmla \d1\().8h, \s0\().8h, \s1\().h[1] + fmla \d2\().8h, \s0\().8h, \s1\().h[2] + fmla \d3\().8h, \s0\().8h, \s1\().h[3] + fmla \d4\().8h, \s0\().8h, \s1\().h[4] + fmla \d5\().8h, \s0\().8h, \s1\().h[5] + fmla \d6\().8h, \s0\().8h, \s1\().h[6] + fmla \d7\().8h, \s0\().8h, \s1\().h[7] +.endm + +.macro FMLA_4 d0, d1, d2, d3, s0, s1 + fmla \d0\().8h, \s0\().8h, \s1\().h[0] + fmla \d1\().8h, \s0\().8h, \s1\().h[1] + fmla \d2\().8h, \s0\().8h, \s1\().h[2] + fmla \d3\().8h, \s0\().8h, \s1\().h[3] 
+.endm + +.macro FMUL_8 d0, d1, d2, d3, d4, d5, d6, d7, s0, s1 + fmul \d0\().8h, \s0\().8h, \s1\().h[0] + fmul \d1\().8h, \s0\().8h, \s1\().h[1] + fmul \d2\().8h, \s0\().8h, \s1\().h[2] + fmul \d3\().8h, \s0\().8h, \s1\().h[3] + fmul \d4\().8h, \s0\().8h, \s1\().h[4] + fmul \d5\().8h, \s0\().8h, \s1\().h[5] + fmul \d6\().8h, \s0\().8h, \s1\().h[6] + fmul \d7\().8h, \s0\().8h, \s1\().h[7] +.endm + +.macro FMUL_4 d0, d1, d2, d3, s0, s1 + fmul \d0\().8h, \s0\().8h, \s1\().h[0] + fmul \d1\().8h, \s0\().8h, \s1\().h[1] + fmul \d2\().8h, \s0\().8h, \s1\().h[2] + fmul \d3\().8h, \s0\().8h, \s1\().h[3] +.endm + +.macro FADD_12 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, z0 + fadd \d0\().8h, \d0\().8h, \z0\().8h + fadd \d1\().8h, \d1\().8h, \z0\().8h + fadd \d2\().8h, \d2\().8h, \z0\().8h + fadd \d3\().8h, \d3\().8h, \z0\().8h + fadd \d4\().8h, \d4\().8h, \z0\().8h + fadd \d5\().8h, \d5\().8h, \z0\().8h + fadd \d6\().8h, \d6\().8h, \z0\().8h + fadd \d7\().8h, \d7\().8h, \z0\().8h + fadd \d8\().8h, \d8\().8h, \z0\().8h + fadd \d9\().8h, \d9\().8h, \z0\().8h + fadd \d10\().8h, \d10\().8h, \z0\().8h + fadd \d11\().8h, \d11\().8h, \z0\().8h +.endm + +.macro FMAX_12 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, z0 + fmax \d0\().8h, \d0\().8h, \z0\().8h + fmax \d1\().8h, \d1\().8h, \z0\().8h + fmax \d2\().8h, \d2\().8h, \z0\().8h + fmax \d3\().8h, \d3\().8h, \z0\().8h + fmax \d4\().8h, \d4\().8h, \z0\().8h + fmax \d5\().8h, \d5\().8h, \z0\().8h + fmax \d6\().8h, \d6\().8h, \z0\().8h + fmax \d7\().8h, \d7\().8h, \z0\().8h + fmax \d8\().8h, \d8\().8h, \z0\().8h + fmax \d9\().8h, \d9\().8h, \z0\().8h + fmax \d10\().8h, \d10\().8h, \z0\().8h + fmax \d11\().8h, \d11\().8h, \z0\().8h +.endm + +.macro FMIN_12 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, z0 + fmin \d0\().8h, \d0\().8h, \z0\().8h + fmin \d1\().8h, \d1\().8h, \z0\().8h + fmin \d2\().8h, \d2\().8h, \z0\().8h + fmin \d3\().8h, \d3\().8h, \z0\().8h + fmin \d4\().8h, \d4\().8h, \z0\().8h + fmin \d5\().8h, \d5\().8h, \z0\().8h + fmin \d6\().8h, \d6\().8h, \z0\().8h + fmin \d7\().8h, \d7\().8h, \z0\().8h + fmin \d8\().8h, \d8\().8h, \z0\().8h + fmin \d9\().8h, \d9\().8h, \z0\().8h + fmin \d10\().8h, \d10\().8h, \z0\().8h + fmin \d11\().8h, \d11\().8h, \z0\().8h +.endm + // 8 * 24 MatMul asm_function MNNPackedMatMulFP16_int8 //void MNNPackedMatMulFP16_int8(FLOAT16* C, const FLOAT16* A, const FLOAT16* B, const size_t* parameter, const FLOAT16* postParameters, const FLOAT16* bias, const FLOAT16* k, const FLOAT16* b); // x0: C, x1:A, x2:B, x3:parameter, x4: postParameters, x5:bias, x6: quant_alpha, x7: quant_bias -stp d14, d15, [sp, #-64]! +stp d14, d15, [sp, #-96]! 
stp d12, d13, [sp, #16] stp d10, d11, [sp, #32] stp d8, d9, [sp, #48] +stp x19, x20, [sp, #64] +stp x21, x22, [sp, #80] ldr x9, [x3, #8] // l ldr x10, [x3, #16] // h ldr x13, [x3, #24] // cStride ldr x11, [x3, #40] // bExtraStride - -// v0, v1, v2: A -// v3, v4: B -// v8 - v31: C +ldr x19, [x3, #48] // blockId +mov x20, x0 add x10, x10, #7 lsr x10, x10, #3 @@ -41,6 +123,7 @@ LH8: sub x14, x13, #128 LoopH: mov x15, x1 + mov x22, x2 ld1 {v4.8h, v5.8h}, [x6], #32 // alpha ld1 {v6.8h, v7.8h}, [x7], #32 // bias subs x12, x9, #1 @@ -54,35 +137,34 @@ LoopH: mov v1.8h, v6.8h fmla v1.8h, v0.8h, v4.8h + cbnz x19, LH8_BLOCK_GT_0 + + LH8_BLOCK0: ld1 {v0.8h}, [x15], #16 - fmul v8.8h, v1.8h, v0.h[0] - fmul v9.8h, v1.8h, v0.h[1] - fmul v10.8h, v1.8h, v0.h[2] - fmul v11.8h, v1.8h, v0.h[3] - fmul v12.8h, v1.8h, v0.h[4] - fmul v13.8h, v1.8h, v0.h[5] - fmul v14.8h, v1.8h, v0.h[6] - fmul v15.8h, v1.8h, v0.h[7] - - fmul v20.8h, v2.8h, v0.h[0] - fmul v21.8h, v2.8h, v0.h[1] - fmul v22.8h, v2.8h, v0.h[2] - fmul v23.8h, v2.8h, v0.h[3] - fmul v24.8h, v2.8h, v0.h[4] - fmul v25.8h, v2.8h, v0.h[5] - fmul v26.8h, v2.8h, v0.h[6] - fmul v27.8h, v2.8h, v0.h[7] + FMUL_8 v8, v9, v10, v11, v12, v13, v14, v15, v1, v0 + FMUL_8 v20, v21, v22, v23, v24, v25, v26, v27, v2, v0 + ld1 {v0.4h}, [x15], #8 + FMUL_4 v16, v17, v18, v19, v1, v0 + FMUL_4 v28, v29, v30, v31, v2, v0 + b LH8_INIT_END + + LH8_BLOCK_GT_0: + ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x20], #64 + ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x20], #64 + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x20], x14 + + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x20], #64 + ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x20], #64 + ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x20], x14 + ld1 {v0.8h}, [x15], #16 + FMLA_8 v8, v9, v10, v11, v12, v13, v14, v15, v1, v0 + FMLA_8 v20, v21, v22, v23, v24, v25, v26, v27, v2, v0 ld1 {v0.4h}, [x15], #8 - fmul v16.8h, v1.8h, v0.h[0] - fmul v17.8h, v1.8h, v0.h[1] - fmul v18.8h, v1.8h, v0.h[2] - fmul v19.8h, v1.8h, v0.h[3] - fmul v28.8h, v2.8h, v0.h[0] - fmul v29.8h, v2.8h, v0.h[1] - fmul v30.8h, v2.8h, v0.h[2] - fmul v31.8h, v2.8h, v0.h[3] + FMLA_4 v16, v17, v18, v19, v1, v0 + FMLA_4 v28, v29, v30, v31, v2, v0 + LH8_INIT_END: beq LoopLEnd LoopL1: @@ -98,129 +180,37 @@ LoopH: fmla v1.8h, v0.8h, v4.8h ld1 {v0.8h}, [x15], #16 - fmla v8.8h, v1.8h, v0.h[0] - fmla v9.8h, v1.8h, v0.h[1] - fmla v10.8h, v1.8h, v0.h[2] - fmla v11.8h, v1.8h, v0.h[3] - fmla v12.8h, v1.8h, v0.h[4] - fmla v13.8h, v1.8h, v0.h[5] - fmla v14.8h, v1.8h, v0.h[6] - fmla v15.8h, v1.8h, v0.h[7] - - fmla v20.8h, v2.8h, v0.h[0] - fmla v21.8h, v2.8h, v0.h[1] - fmla v22.8h, v2.8h, v0.h[2] - fmla v23.8h, v2.8h, v0.h[3] - fmla v24.8h, v2.8h, v0.h[4] - fmla v25.8h, v2.8h, v0.h[5] - fmla v26.8h, v2.8h, v0.h[6] - fmla v27.8h, v2.8h, v0.h[7] - + FMLA_8 v8, v9, v10, v11, v12, v13, v14, v15, v1, v0 + FMLA_8 v20, v21, v22, v23, v24, v25, v26, v27, v2, v0 ld1 {v0.4h}, [x15], #8 - fmla v16.8h, v1.8h, v0.h[0] - fmla v17.8h, v1.8h, v0.h[1] - fmla v18.8h, v1.8h, v0.h[2] - fmla v19.8h, v1.8h, v0.h[3] - fmla v28.8h, v2.8h, v0.h[0] - fmla v29.8h, v2.8h, v0.h[1] - fmla v30.8h, v2.8h, v0.h[2] - fmla v31.8h, v2.8h, v0.h[3] - + FMLA_4 v16, v17, v18, v19, v1, v0 + FMLA_4 v28, v29, v30, v31, v2, v0 bne LoopL1 LoopLEnd: add x2, x2, x11 + sub x10, x10, #2 + cmp x10, #2 - cbz x4, StoreLH8 + cbz x4, StoreLH8 // If postParameter* is nullptr, not the last blockId, just store the intemediate results. 
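The int8 variant appears to follow the same block-accumulation contract as the int4 kernel sketched earlier (blockId in `parameter[6]` selecting fmul-initialize versus fmla-accumulate, post-treatment deferred until `postParameters` is non-null); only the weight decode differs, so the driver sketch above would apply with `MNNPackedMatMulFP16_int8` substituted. The enlarged 96-byte frame here preserves x19–x22, which now carry the blockId and the C-tile reload pointer.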
- AddBiasLH8: - ld1 {v5.8h}, [x4] + ld1 {v5.4s}, [x4] fcvtn v5.4h, v5.4s dup v6.8h, v5.h[2] // Min Value dup v7.8h, v5.h[3] // Max Value - ld1 {v0.8h, v1.8h}, [x5], #32 - - fmla v8.8h, v0.8h, v5.h[1] - fmla v9.8h, v0.8h, v5.h[1] - fmla v10.8h, v0.8h, v5.h[1] - fmla v11.8h, v0.8h, v5.h[1] - - fmla v12.8h, v0.8h, v5.h[1] - fmla v13.8h, v0.8h, v5.h[1] - fmla v14.8h, v0.8h, v5.h[1] - fmla v15.8h, v0.8h, v5.h[1] - - fmla v16.8h, v0.8h, v5.h[1] - fmla v17.8h, v0.8h, v5.h[1] - fmla v18.8h, v0.8h, v5.h[1] - fmla v19.8h, v0.8h, v5.h[1] - - fmla v20.8h, v1.8h, v5.h[1] - fmla v21.8h, v1.8h, v5.h[1] - fmla v22.8h, v1.8h, v5.h[1] - fmla v23.8h, v1.8h, v5.h[1] - - fmla v24.8h, v1.8h, v5.h[1] - fmla v25.8h, v1.8h, v5.h[1] - fmla v26.8h, v1.8h, v5.h[1] - fmla v27.8h, v1.8h, v5.h[1] - - fmla v28.8h, v1.8h, v5.h[1] - fmla v29.8h, v1.8h, v5.h[1] - fmla v30.8h, v1.8h, v5.h[1] - fmla v31.8h, v1.8h, v5.h[1] + + AddBiasLH8: + cbz x5, PostTreatLH8 + ld1 {v0.8h, v1.8h}, [x5], #32 // gemm bias + FADD_12 v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v0 + FADD_12 v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v1 PostTreatLH8: - fmax v8.8h, v8.8h, v6.8h - fmax v9.8h, v9.8h, v6.8h - fmax v10.8h, v10.8h, v6.8h - fmax v11.8h, v11.8h, v6.8h - fmax v12.8h, v12.8h, v6.8h - fmax v13.8h, v13.8h, v6.8h - fmax v14.8h, v14.8h, v6.8h - fmax v15.8h, v15.8h, v6.8h - fmax v16.8h, v16.8h, v6.8h - fmax v17.8h, v17.8h, v6.8h - fmax v18.8h, v18.8h, v6.8h - fmax v19.8h, v19.8h, v6.8h - fmax v20.8h, v20.8h, v6.8h - fmax v21.8h, v21.8h, v6.8h - fmax v22.8h, v22.8h, v6.8h - fmax v23.8h, v23.8h, v6.8h - fmax v24.8h, v24.8h, v6.8h - fmax v25.8h, v25.8h, v6.8h - fmax v26.8h, v26.8h, v6.8h - fmax v27.8h, v27.8h, v6.8h - fmax v28.8h, v28.8h, v6.8h - fmax v29.8h, v29.8h, v6.8h - fmax v30.8h, v30.8h, v6.8h - fmax v31.8h, v31.8h, v6.8h - - fmin v8.8h, v8.8h, v7.8h - fmin v9.8h, v9.8h, v7.8h - fmin v10.8h, v10.8h, v7.8h - fmin v11.8h, v11.8h, v7.8h - fmin v12.8h, v12.8h, v7.8h - fmin v13.8h, v13.8h, v7.8h - fmin v14.8h, v14.8h, v7.8h - fmin v15.8h, v15.8h, v7.8h - fmin v16.8h, v16.8h, v7.8h - fmin v17.8h, v17.8h, v7.8h - fmin v18.8h, v18.8h, v7.8h - fmin v19.8h, v19.8h, v7.8h - fmin v20.8h, v20.8h, v7.8h - fmin v21.8h, v21.8h, v7.8h - fmin v22.8h, v22.8h, v7.8h - fmin v23.8h, v23.8h, v7.8h - fmin v24.8h, v24.8h, v7.8h - fmin v25.8h, v25.8h, v7.8h - fmin v26.8h, v26.8h, v7.8h - fmin v27.8h, v27.8h, v7.8h - fmin v28.8h, v28.8h, v7.8h - fmin v29.8h, v29.8h, v7.8h - fmin v30.8h, v30.8h, v7.8h - fmin v31.8h, v31.8h, v7.8h + FMAX_12 v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v6 + FMAX_12 v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v6 + FMIN_12 v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v7 + FMIN_12 v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v7 StoreLH8: @@ -228,12 +218,9 @@ LoopH: st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x0], #64 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x14 - sub x10, x10, #2 - st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], #64 st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], x14 - cmp x10, #2 bge LoopH @@ -252,6 +239,15 @@ LoopHRemain: fmla v3.8h, v6.8h, v20.8h ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 + cbnz x19, LH4_BLOCK_GT_0 + + LH4_BLOCK0: + + FMUL_4 v8, v9, v10, v11, v3, v0 + FMUL_4 v12, v13, v14, v15, v3, v1 + FMUL_4 v16, v17, v18, v19, v3, v2 + b LH4_INIT_END + /* fmul v8.8h, v3.8h, v0.h[0] fmul v9.8h, v3.8h, v0.h[1] fmul v10.8h, v3.8h, v0.h[2] @@ -264,7 +260,17 @@ LoopHRemain: fmul v17.8h, v3.8h, v2.h[1] 
fmul v18.8h, v3.8h, v2.h[2] fmul v19.8h, v3.8h, v2.h[3] + */ + + LH4_BLOCK_GT_0: + ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x20], #64 + ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x20], #64 + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x20] + FMLA_4 v8, v9, v10, v11, v3, v0 + FMLA_4 v12, v13, v14, v15, v3, v1 + FMLA_4 v16, v17, v18, v19, v3, v2 + LH4_INIT_END: beq LoopLREnd LoopLR: @@ -276,71 +282,26 @@ LoopHRemain: fmla v3.8h, v6.8h, v20.8h ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 - fmla v8.8h, v3.8h, v0.h[0] - fmla v9.8h, v3.8h, v0.h[1] - fmla v10.8h, v3.8h, v0.h[2] - fmla v11.8h, v3.8h, v0.h[3] - fmla v12.8h, v3.8h, v1.h[0] - fmla v13.8h, v3.8h, v1.h[1] - fmla v14.8h, v3.8h, v1.h[2] - fmla v15.8h, v3.8h, v1.h[3] - fmla v16.8h, v3.8h, v2.h[0] - fmla v17.8h, v3.8h, v2.h[1] - fmla v18.8h, v3.8h, v2.h[2] - fmla v19.8h, v3.8h, v2.h[3] - + FMLA_4 v8, v9, v10, v11, v3, v0 + FMLA_4 v12, v13, v14, v15, v3, v1 + FMLA_4 v16, v17, v18, v19, v3, v2 bne LoopLR LoopLREnd: cbz x4, StoreLH4 - AddBiasLH4: - ld1 {v5.8h}, [x4] + + ld1 {v5.4s}, [x4] fcvtn v5.4h, v5.4s dup v6.8h, v5.h[2] // Min Value dup v7.8h, v5.h[3] // Max Value + AddBiasLH4: + cbz x5, PostTreatLH4 ld1 {v0.8h}, [x5], #16 - - fmla v8.8h, v0.8h, v5.h[1] - fmla v9.8h, v0.8h, v5.h[1] - fmla v10.8h, v0.8h, v5.h[1] - fmla v11.8h, v0.8h, v5.h[1] - - fmla v12.8h, v0.8h, v5.h[1] - fmla v13.8h, v0.8h, v5.h[1] - fmla v14.8h, v0.8h, v5.h[1] - fmla v15.8h, v0.8h, v5.h[1] - - fmla v16.8h, v0.8h, v5.h[1] - fmla v17.8h, v0.8h, v5.h[1] - fmla v18.8h, v0.8h, v5.h[1] - fmla v19.8h, v0.8h, v5.h[1] + FADD_12 v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v0 PostTreatLH4: - fmax v8.8h, v8.8h, v6.8h - fmax v9.8h, v9.8h, v6.8h - fmax v10.8h, v10.8h, v6.8h - fmax v11.8h, v11.8h, v6.8h - fmax v12.8h, v12.8h, v6.8h - fmax v13.8h, v13.8h, v6.8h - fmax v14.8h, v14.8h, v6.8h - fmax v15.8h, v15.8h, v6.8h - fmax v16.8h, v16.8h, v6.8h - fmax v17.8h, v17.8h, v6.8h - fmax v18.8h, v18.8h, v6.8h - fmax v19.8h, v19.8h, v6.8h - - fmin v8.8h, v8.8h, v7.8h - fmin v9.8h, v9.8h, v7.8h - fmin v10.8h, v10.8h, v7.8h - fmin v11.8h, v11.8h, v7.8h - fmin v12.8h, v12.8h, v7.8h - fmin v13.8h, v13.8h, v7.8h - fmin v14.8h, v14.8h, v7.8h - fmin v15.8h, v15.8h, v7.8h - fmin v16.8h, v16.8h, v7.8h - fmin v17.8h, v17.8h, v7.8h - fmin v18.8h, v18.8h, v7.8h - fmin v19.8h, v19.8h, v7.8h + FMAX_12 v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v6 + FMIN_12 v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v7 StoreLH4: @@ -350,10 +311,12 @@ LoopHRemain: End: +ldp x21, x22, [sp, #80] +ldp x19, x20, [sp, #64] ldp d8, d9, [sp, #48] ldp d10, d11, [sp, #32] ldp d12, d13, [sp, #16] -ldp d14, d15, [sp], #64 +ldp d14, d15, [sp], #96 ret diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int4.S b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int4.S index 0c93af680..3949f7414 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int4.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int4.S @@ -39,10 +39,10 @@ ldr x10, [x4, #16] // h ldr x7, [x4, #24] // cStride ldr x19, [x4, #40] // bExtraStride +ldr x26, [x4, #48] // blockId add x10, x10, #7 lsr x10, x10, #3 - cbz x5, Start ld1 {v5.4s}, [x5] fcvtn v5.4h, v5.4s @@ -63,6 +63,7 @@ LoopE8: mov x13, x2 mov x14, x22 mov x25, x23 + LH8: cmp x8, #2 @@ -90,6 +91,8 @@ LoopE8: fmla v4.8h, v2.8h, v13.8h ld1 {v0.8h}, [x15], x11 + cbnz x26, LE8H8_BLOCK_GT_0 + fmul v16.8h, v3.8h, v0.h[0] fmul v17.8h, v3.8h, v0.h[1] fmul v18.8h, v3.8h, v0.h[2] @@ -109,6 
+112,38 @@ LoopE8: fmul v29.8h, v4.8h, v0.h[5] fmul v30.8h, v4.8h, v0.h[6] fmul v31.8h, v4.8h, v0.h[7] + b LE8H8_INIT_END + + LE8H8_BLOCK_GT_0: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 + ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x24 + + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0] + sub x0, x0, #128 + sub x0, x0, x24 + + fmla v16.8h, v3.8h, v0.h[0] + fmla v17.8h, v3.8h, v0.h[1] + fmla v18.8h, v3.8h, v0.h[2] + fmla v19.8h, v3.8h, v0.h[3] + + fmla v20.8h, v4.8h, v0.h[0] + fmla v21.8h, v4.8h, v0.h[1] + fmla v22.8h, v4.8h, v0.h[2] + fmla v23.8h, v4.8h, v0.h[3] + + fmla v24.8h, v3.8h, v0.h[4] + fmla v25.8h, v3.8h, v0.h[5] + fmla v26.8h, v3.8h, v0.h[6] + fmla v27.8h, v3.8h, v0.h[7] + + fmla v28.8h, v4.8h, v0.h[4] + fmla v29.8h, v4.8h, v0.h[5] + fmla v30.8h, v4.8h, v0.h[6] + fmla v31.8h, v4.8h, v0.h[7] + + LE8H8_INIT_END: beq LoopLEnd LoopL: @@ -156,30 +191,30 @@ LoopE8: add x13, x13, x19 sub x8, x8, #2 - cbz x5, StoreLH8 AddBiasLH8: + cbz x20, PostTreatLH8 ld1 {v0.8h, v1.8h}, [x20], #32 - fmla v16.8h, v0.8h, v5.h[1] - fmla v17.8h, v0.8h, v5.h[1] - fmla v18.8h, v0.8h, v5.h[1] - fmla v19.8h, v0.8h, v5.h[1] + fadd v16.8h, v0.8h, v16.8h + fadd v17.8h, v0.8h, v17.8h + fadd v18.8h, v0.8h, v18.8h + fadd v19.8h, v0.8h, v19.8h - fmla v20.8h, v1.8h, v5.h[1] - fmla v21.8h, v1.8h, v5.h[1] - fmla v22.8h, v1.8h, v5.h[1] - fmla v23.8h, v1.8h, v5.h[1] + fadd v20.8h, v1.8h, v20.8h + fadd v21.8h, v1.8h, v21.8h + fadd v22.8h, v1.8h, v22.8h + fadd v23.8h, v1.8h, v23.8h - fmla v24.8h, v0.8h, v5.h[1] - fmla v25.8h, v0.8h, v5.h[1] - fmla v26.8h, v0.8h, v5.h[1] - fmla v27.8h, v0.8h, v5.h[1] + fadd v24.8h, v0.8h, v24.8h + fadd v25.8h, v0.8h, v25.8h + fadd v26.8h, v0.8h, v26.8h + fadd v27.8h, v0.8h, v27.8h - fmla v28.8h, v1.8h, v5.h[1] - fmla v29.8h, v1.8h, v5.h[1] - fmla v30.8h, v1.8h, v5.h[1] - fmla v31.8h, v1.8h, v5.h[1] + fadd v28.8h, v1.8h, v28.8h + fadd v29.8h, v1.8h, v29.8h + fadd v30.8h, v1.8h, v30.8h + fadd v31.8h, v1.8h, v31.8h PostTreatLH8: fmax v16.8h, v16.8h, v6.8h @@ -245,6 +280,8 @@ LoopE8: fmla v3.8h, v1.8h, v12.8h ld1 {v0.8h}, [x15], x11 + cbnz x26, LE8H4_BLOCK_GT_0 + fmul v16.8h, v3.8h, v0.h[0] fmul v17.8h, v3.8h, v0.h[1] fmul v18.8h, v3.8h, v0.h[2] @@ -253,6 +290,22 @@ LoopE8: fmul v21.8h, v3.8h, v0.h[5] fmul v22.8h, v3.8h, v0.h[6] fmul v23.8h, v3.8h, v0.h[7] + b LE8H4_INIT_END + + LE8H4_BLOCK_GT_0: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0] + fmla v16.8h, v3.8h, v0.h[0] + fmla v17.8h, v3.8h, v0.h[1] + fmla v18.8h, v3.8h, v0.h[2] + fmla v19.8h, v3.8h, v0.h[3] + sub x0, x0, #64 + + fmla v20.8h, v3.8h, v0.h[4] + fmla v21.8h, v3.8h, v0.h[5] + fmla v22.8h, v3.8h, v0.h[6] + fmla v23.8h, v3.8h, v0.h[7] + LE8H4_INIT_END: beq LoopLREnd LoopLR: @@ -283,17 +336,18 @@ LoopE8: cbz x5, StoreLH8x4 AddBiasLH8x4: + cbz x20, PostTreatLH8x4 ld1 {v0.8h}, [x20] - fmla v16.8h, v0.8h, v5.h[1] - fmla v17.8h, v0.8h, v5.h[1] - fmla v18.8h, v0.8h, v5.h[1] - fmla v19.8h, v0.8h, v5.h[1] + fadd v16.8h, v16.8h, v0.8h + fadd v17.8h, v17.8h, v0.8h + fadd v18.8h, v18.8h, v0.8h + fadd v19.8h, v19.8h, v0.8h - fmla v20.8h, v0.8h, v5.h[1] - fmla v21.8h, v0.8h, v5.h[1] - fmla v22.8h, v0.8h, v5.h[1] - fmla v23.8h, v0.8h, v5.h[1] + fadd v20.8h, v20.8h, v0.8h + fadd v21.8h, v21.8h, v0.8h + fadd v22.8h, v22.8h, v0.8h + fadd v23.8h, v23.8h, v0.8h PostTreatLH8x4: fmax v16.8h, v16.8h, v6.8h @@ -362,6 +416,8 @@ blt E1 fmla v4.8h, v2.8h, v13.8h ld1 {v0.4h}, [x15], x11 + cbnz x26, LE4H8_BLOCK_GT_0 + fmul v16.8h, v3.8h, v0.h[0] fmul v17.8h, 
v3.8h, v0.h[1] fmul v18.8h, v3.8h, v0.h[2] @@ -371,7 +427,23 @@ blt E1 fmul v21.8h, v4.8h, v0.h[1] fmul v22.8h, v4.8h, v0.h[2] fmul v23.8h, v4.8h, v0.h[3] + b LE4H8_INIT_END + LE4H8_BLOCK_GT_0: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x7 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0] + fmla v16.8h, v3.8h, v0.h[0] + fmla v17.8h, v3.8h, v0.h[1] + fmla v18.8h, v3.8h, v0.h[2] + fmla v19.8h, v3.8h, v0.h[3] + sub x0, x0, x7 + + fmla v20.8h, v4.8h, v0.h[0] + fmla v21.8h, v4.8h, v0.h[1] + fmla v22.8h, v4.8h, v0.h[2] + fmla v23.8h, v4.8h, v0.h[3] + + LE4H8_INIT_END: beq E4LoopLEnd E4LoopL: @@ -415,17 +487,18 @@ blt E1 cbz x5, StoreLH4x8 AddBiasLH4x8: + cbz x20, PostTreatLH4x8 ld1 {v0.8h, v1.8h}, [x20], #32 - fmla v16.8h, v0.8h, v5.h[1] - fmla v17.8h, v0.8h, v5.h[1] - fmla v18.8h, v0.8h, v5.h[1] - fmla v19.8h, v0.8h, v5.h[1] + fadd v16.8h, v0.8h, v16.8h + fadd v17.8h, v0.8h, v17.8h + fadd v18.8h, v0.8h, v18.8h + fadd v19.8h, v0.8h, v19.8h - fmla v20.8h, v1.8h, v5.h[1] - fmla v21.8h, v1.8h, v5.h[1] - fmla v22.8h, v1.8h, v5.h[1] - fmla v23.8h, v1.8h, v5.h[1] + fadd v20.8h, v1.8h, v20.8h + fadd v21.8h, v1.8h, v21.8h + fadd v22.8h, v1.8h, v22.8h + fadd v23.8h, v1.8h, v23.8h PostTreatLH4x8: fmax v16.8h, v16.8h, v6.8h @@ -472,11 +545,21 @@ blt E1 fmla v3.8h, v1.8h, v12.8h ld1 {v0.4h}, [x15], x11 + cbnz x26, LE4H4_BLOCK_GT_0 fmul v16.8h, v3.8h, v0.h[0] fmul v17.8h, v3.8h, v0.h[1] fmul v18.8h, v3.8h, v0.h[2] fmul v19.8h, v3.8h, v0.h[3] + b LE4H4_INIT_END + + LE4H4_BLOCK_GT_0: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0] + fmla v16.8h, v3.8h, v0.h[0] + fmla v17.8h, v3.8h, v0.h[1] + fmla v18.8h, v3.8h, v0.h[2] + fmla v19.8h, v3.8h, v0.h[3] + LE4H4_INIT_END: beq E4LoopLREnd E4LoopLR: @@ -502,12 +585,13 @@ blt E1 cbz x5, StoreLH4x4 AddBiasLH4x4: + cbz x20, PostTreatLH4x4 ld1 {v0.8h}, [x20] - fmla v16.8h, v0.8h, v5.h[1] - fmla v17.8h, v0.8h, v5.h[1] - fmla v18.8h, v0.8h, v5.h[1] - fmla v19.8h, v0.8h, v5.h[1] + fadd v16.8h, v16.8h, v0.8h + fadd v17.8h, v17.8h, v0.8h + fadd v18.8h, v18.8h, v0.8h + fadd v19.8h, v19.8h, v0.8h PostTreatLH4x4: @@ -569,9 +653,19 @@ LoopE1: fmla v4.8h, v2.8h, v13.8h ld1 {v0.h}[0], [x15], x11 + cbnz x26, LE1H8_BLOCK_GT_0 fmul v16.8h, v3.8h, v0.h[0] fmul v20.8h, v4.8h, v0.h[0] + b LE1H8_INIT_END + + LE1H8_BLOCK_GT_0: + ld1 {v16.8h}, [x0], x7 + ld1 {v20.8h}, [x0] + sub x0, x0, x7 + fmla v16.8h, v3.8h, v0.h[0] + fmla v20.8h, v4.8h, v0.h[0] + LE1H8_INIT_END: beq E1LoopLEnd E1LoopL: @@ -606,10 +700,11 @@ LoopE1: cbz x5, StoreLH1x8 AddBiasLH1x8: + cbz x20, PostTreatLH1x8 ld1 {v0.8h, v1.8h}, [x20], #32 - fmla v16.8h, v0.8h, v5.h[1] - fmla v20.8h, v1.8h, v5.h[1] + fadd v16.8h, v0.8h, v16.8h + fadd v20.8h, v1.8h, v20.8h PostTreatLH1x8: fmax v16.8h, v16.8h, v6.8h @@ -641,10 +736,15 @@ LoopE1: scvtf v1.8h, v1.8h mov v3.8h, v14.8h fmla v3.8h, v1.8h, v12.8h - ld1 {v0.h}[0], [x15], x11 + cbnz x26, LE1H4_BLOCK_GT_0 fmul v16.8h, v3.8h, v0.h[0] + b LE1H4_INIT_END + LE1H4_BLOCK_GT_0: + ld1 {v16.8h}, [x0] + fmla v16.8h, v3.8h, v0.h[0] + LE1H4_INIT_END: beq E1LoopLREnd E1LoopLR: @@ -667,6 +767,7 @@ LoopE1: cbz x5, StoreLH1x4 AddBiasLH1x4: + cbz x20, PostTreatLH1x4 ld1 {v0.8h}, [x20] fmla v16.8h, v0.8h, v5.h[1] diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int8.S b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int8.S index 0760af42b..f73046ec0 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int8.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int8.S @@ -36,10 +36,10 @@ ldr x10, [x4, #16] 
// h ldr x7, [x4, #24] // cStride ldr x19, [x4, #40] // bExtraStride +ldr x26, [x4, #48] // blockId add x10, x10, #7 lsr x10, x10, #3 - cbz x5, Start ld1 {v5.4s}, [x5] fcvtn v5.4h, v5.4s @@ -81,6 +81,8 @@ LoopE8: fmla v4.8h, v2.8h, v13.8h ld1 {v0.8h}, [x15], x11 + cbnz x26, LE8H8_BLOCK_GT_0 + fmul v16.8h, v3.8h, v0.h[0] fmul v17.8h, v3.8h, v0.h[1] fmul v18.8h, v3.8h, v0.h[2] @@ -100,6 +102,35 @@ LoopE8: fmul v29.8h, v4.8h, v0.h[5] fmul v30.8h, v4.8h, v0.h[6] fmul v31.8h, v4.8h, v0.h[7] + b LE8H8_INIT_END + LE8H8_BLOCK_GT_0: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 + ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x24 + + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0] + fmla v16.8h, v3.8h, v0.h[0] + fmla v17.8h, v3.8h, v0.h[1] + fmla v18.8h, v3.8h, v0.h[2] + fmla v19.8h, v3.8h, v0.h[3] + sub x0, x0, #128 + sub x0, x0, x24 + + fmla v20.8h, v4.8h, v0.h[0] + fmla v21.8h, v4.8h, v0.h[1] + fmla v22.8h, v4.8h, v0.h[2] + fmla v23.8h, v4.8h, v0.h[3] + + fmla v24.8h, v3.8h, v0.h[4] + fmla v25.8h, v3.8h, v0.h[5] + fmla v26.8h, v3.8h, v0.h[6] + fmla v27.8h, v3.8h, v0.h[7] + + fmla v28.8h, v4.8h, v0.h[4] + fmla v29.8h, v4.8h, v0.h[5] + fmla v30.8h, v4.8h, v0.h[6] + fmla v31.8h, v4.8h, v0.h[7] + LE8H8_INIT_END: beq LoopLEnd LoopL: @@ -141,30 +172,30 @@ LoopE8: add x13, x13, x19 sub x8, x8, #2 - cbz x5, StoreLH8 AddBiasLH8: + cbz x20, PostTreatLH8 ld1 {v0.8h, v1.8h}, [x20], #32 - fmla v16.8h, v0.8h, v5.h[1] - fmla v17.8h, v0.8h, v5.h[1] - fmla v18.8h, v0.8h, v5.h[1] - fmla v19.8h, v0.8h, v5.h[1] + fadd v16.8h, v0.8h, v16.8h + fadd v17.8h, v0.8h, v17.8h + fadd v18.8h, v0.8h, v18.8h + fadd v19.8h, v0.8h, v19.8h - fmla v20.8h, v1.8h, v5.h[1] - fmla v21.8h, v1.8h, v5.h[1] - fmla v22.8h, v1.8h, v5.h[1] - fmla v23.8h, v1.8h, v5.h[1] + fadd v20.8h, v1.8h, v20.8h + fadd v21.8h, v1.8h, v21.8h + fadd v22.8h, v1.8h, v22.8h + fadd v23.8h, v1.8h, v23.8h - fmla v24.8h, v0.8h, v5.h[1] - fmla v25.8h, v0.8h, v5.h[1] - fmla v26.8h, v0.8h, v5.h[1] - fmla v27.8h, v0.8h, v5.h[1] + fadd v24.8h, v0.8h, v24.8h + fadd v25.8h, v0.8h, v25.8h + fadd v26.8h, v0.8h, v26.8h + fadd v27.8h, v0.8h, v27.8h - fmla v28.8h, v1.8h, v5.h[1] - fmla v29.8h, v1.8h, v5.h[1] - fmla v30.8h, v1.8h, v5.h[1] - fmla v31.8h, v1.8h, v5.h[1] + fadd v28.8h, v1.8h, v28.8h + fadd v29.8h, v1.8h, v29.8h + fadd v30.8h, v1.8h, v30.8h + fadd v31.8h, v1.8h, v31.8h PostTreatLH8: fmax v16.8h, v16.8h, v6.8h @@ -226,6 +257,8 @@ LoopE8: fmla v3.8h, v1.8h, v12.8h ld1 {v0.8h}, [x15], x11 + cbnz x26, LE8H4_BLOCK_GT_0 + fmul v16.8h, v3.8h, v0.h[0] fmul v17.8h, v3.8h, v0.h[1] fmul v18.8h, v3.8h, v0.h[2] @@ -234,6 +267,22 @@ LoopE8: fmul v21.8h, v3.8h, v0.h[5] fmul v22.8h, v3.8h, v0.h[6] fmul v23.8h, v3.8h, v0.h[7] + b LE8H4_INIT_END + + LE8H4_BLOCK_GT_0: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0] + fmla v16.8h, v3.8h, v0.h[0] + fmla v17.8h, v3.8h, v0.h[1] + fmla v18.8h, v3.8h, v0.h[2] + fmla v19.8h, v3.8h, v0.h[3] + sub x0, x0, #64 + + fmla v20.8h, v3.8h, v0.h[4] + fmla v21.8h, v3.8h, v0.h[5] + fmla v22.8h, v3.8h, v0.h[6] + fmla v23.8h, v3.8h, v0.h[7] + LE8H4_INIT_END: beq LoopLREnd LoopLR: @@ -260,17 +309,18 @@ LoopE8: cbz x5, StoreLH8x4 AddBiasLH8x4: + cbz x20, PostTreatLH8x4 ld1 {v0.8h}, [x20] - fmla v16.8h, v0.8h, v5.h[1] - fmla v17.8h, v0.8h, v5.h[1] - fmla v18.8h, v0.8h, v5.h[1] - fmla v19.8h, v0.8h, v5.h[1] + fadd v16.8h, v16.8h, v0.8h + fadd v17.8h, v17.8h, v0.8h + fadd v18.8h, v18.8h, v0.8h + fadd v19.8h, v19.8h, v0.8h - fmla v20.8h, v0.8h, v5.h[1] - fmla v21.8h, 
v0.8h, v5.h[1] - fmla v22.8h, v0.8h, v5.h[1] - fmla v23.8h, v0.8h, v5.h[1] + fadd v20.8h, v20.8h, v0.8h + fadd v21.8h, v21.8h, v0.8h + fadd v22.8h, v22.8h, v0.8h + fadd v23.8h, v23.8h, v0.8h PostTreatLH8x4: fmax v16.8h, v16.8h, v6.8h @@ -333,6 +383,8 @@ blt E1 fmla v4.8h, v2.8h, v13.8h ld1 {v0.4h}, [x15], x11 + cbnz x26, LE4H8_BLOCK_GT_0 + fmul v16.8h, v3.8h, v0.h[0] fmul v17.8h, v3.8h, v0.h[1] fmul v18.8h, v3.8h, v0.h[2] @@ -342,7 +394,23 @@ blt E1 fmul v21.8h, v4.8h, v0.h[1] fmul v22.8h, v4.8h, v0.h[2] fmul v23.8h, v4.8h, v0.h[3] + b LE4H8_INIT_END + LE4H8_BLOCK_GT_0: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x7 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0] + fmla v16.8h, v3.8h, v0.h[0] + fmla v17.8h, v3.8h, v0.h[1] + fmla v18.8h, v3.8h, v0.h[2] + fmla v19.8h, v3.8h, v0.h[3] + sub x0, x0, x7 + + fmla v20.8h, v4.8h, v0.h[0] + fmla v21.8h, v4.8h, v0.h[1] + fmla v22.8h, v4.8h, v0.h[2] + fmla v23.8h, v4.8h, v0.h[3] + + LE4H8_INIT_END: beq E4LoopLEnd E4LoopL: @@ -379,17 +447,18 @@ blt E1 cbz x5, StoreLH4x8 AddBiasLH4x8: + cbz x20, PostTreatLH4x8 ld1 {v0.8h, v1.8h}, [x20], #32 - fmla v16.8h, v0.8h, v5.h[1] - fmla v17.8h, v0.8h, v5.h[1] - fmla v18.8h, v0.8h, v5.h[1] - fmla v19.8h, v0.8h, v5.h[1] + fadd v16.8h, v0.8h, v16.8h + fadd v17.8h, v0.8h, v17.8h + fadd v18.8h, v0.8h, v18.8h + fadd v19.8h, v0.8h, v19.8h - fmla v20.8h, v1.8h, v5.h[1] - fmla v21.8h, v1.8h, v5.h[1] - fmla v22.8h, v1.8h, v5.h[1] - fmla v23.8h, v1.8h, v5.h[1] + fadd v20.8h, v1.8h, v20.8h + fadd v21.8h, v1.8h, v21.8h + fadd v22.8h, v1.8h, v22.8h + fadd v23.8h, v1.8h, v23.8h PostTreatLH4x8: fmax v16.8h, v16.8h, v6.8h @@ -432,11 +501,21 @@ blt E1 fmla v3.8h, v1.8h, v12.8h ld1 {v0.4h}, [x15], x11 + cbnz x26, LE4H4_BLOCK_GT_0 fmul v16.8h, v3.8h, v0.h[0] fmul v17.8h, v3.8h, v0.h[1] fmul v18.8h, v3.8h, v0.h[2] fmul v19.8h, v3.8h, v0.h[3] + b LE4H4_INIT_END + + LE4H4_BLOCK_GT_0: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0] + fmla v16.8h, v3.8h, v0.h[0] + fmla v17.8h, v3.8h, v0.h[1] + fmla v18.8h, v3.8h, v0.h[2] + fmla v19.8h, v3.8h, v0.h[3] + LE4H4_INIT_END: beq E4LoopLREnd E4LoopLR: @@ -458,12 +537,13 @@ blt E1 cbz x5, StoreLH4x4 AddBiasLH4x4: + cbz x20, PostTreatLH4x4 ld1 {v0.8h}, [x20] - fmla v16.8h, v0.8h, v5.h[1] - fmla v17.8h, v0.8h, v5.h[1] - fmla v18.8h, v0.8h, v5.h[1] - fmla v19.8h, v0.8h, v5.h[1] + fadd v16.8h, v16.8h, v0.8h + fadd v17.8h, v17.8h, v0.8h + fadd v18.8h, v18.8h, v0.8h + fadd v19.8h, v19.8h, v0.8h PostTreatLH4x4: @@ -519,9 +599,19 @@ LoopE1: fmla v4.8h, v2.8h, v13.8h ld1 {v0.h}[0], [x15], x11 + cbnz x26, LE1H8_BLOCK_GT_0 fmul v16.8h, v3.8h, v0.h[0] fmul v20.8h, v4.8h, v0.h[0] + b LE1H8_INIT_END + + LE1H8_BLOCK_GT_0: + ld1 {v16.8h}, [x0], x7 + ld1 {v20.8h}, [x0] + sub x0, x0, x7 + fmla v16.8h, v3.8h, v0.h[0] + fmla v20.8h, v4.8h, v0.h[0] + LE1H8_INIT_END: beq E1LoopLEnd E1LoopL: @@ -550,10 +640,11 @@ LoopE1: cbz x5, StoreLH1x8 AddBiasLH1x8: + cbz x20, PostTreatLH1x8 ld1 {v0.8h, v1.8h}, [x20], #32 - fmla v16.8h, v0.8h, v5.h[1] - fmla v20.8h, v1.8h, v5.h[1] + fadd v16.8h, v0.8h, v16.8h + fadd v20.8h, v1.8h, v20.8h PostTreatLH1x8: fmax v16.8h, v16.8h, v6.8h @@ -582,8 +673,14 @@ LoopE1: fmla v3.8h, v1.8h, v12.8h ld1 {v0.h}[0], [x15], x11 + cbnz x26, LE1H4_BLOCK_GT_0 fmul v16.8h, v3.8h, v0.h[0] + b LE1H4_INIT_END + LE1H4_BLOCK_GT_0: + ld1 {v16.8h}, [x0] + fmla v16.8h, v3.8h, v0.h[0] + LE1H4_INIT_END: beq E1LoopLREnd E1LoopLR: @@ -601,6 +698,7 @@ LoopE1: cbz x5, StoreLH1x4 AddBiasLH1x4: + cbz x20, PostTreatLH1x4 ld1 {v0.8h}, [x20] fmla v16.8h, v0.8h, v5.h[1] diff --git a/source/backend/cpu/CMakeLists.txt 
b/source/backend/cpu/CMakeLists.txt index 5c91ad5b6..22aeb1ef4 100644 --- a/source/backend/cpu/CMakeLists.txt +++ b/source/backend/cpu/CMakeLists.txt @@ -1,6 +1,4 @@ # CPU -option(MNN_SUPPORT_BF16 "Enable MNN's bf16 op" OFF) -option(MNN_LOW_MEMORY "Build MNN support low memory for weight quant model." OFF) if(MNN_SUPPORT_RENDER) FILE(GLOB MNN_CPU_SRC ${CMAKE_CURRENT_LIST_DIR}/* ${CMAKE_CURRENT_LIST_DIR}/compute/* ${CMAKE_CURRENT_LIST_DIR}/render/*) diff --git a/source/backend/cpu/CPUAttention.cpp b/source/backend/cpu/CPUAttention.cpp index 4d7b0b894..c37d9a3f7 100644 --- a/source/backend/cpu/CPUAttention.cpp +++ b/source/backend/cpu/CPUAttention.cpp @@ -29,8 +29,8 @@ namespace MNN { template static void prefill_pack(Tensor* query, Tensor* key, Tensor* value, char* query_ptr, char* key_ptr, char* value_ptr, - int mMaxLength, int mNumHead, int mHeadDim, int mValueH, - int eP, int hP, int query_e, int key_h, int seq_len, int h) { + int mMaxLength, int mNumHead, int mKvNumHead, int mHeadDim, int mValueH, + int eP, int hP, int query_e, int key_h, int seq_len, int h, int kv_h) { auto query_src = query->host(); auto key_src = key->host(); auto value_src = value->host(); @@ -54,7 +54,7 @@ static void prefill_pack(Tensor* query, Tensor* key, Tensor* value, char* query_ for (int k = 0; k < hP; k++) { int s = i * hP + k; if (s < seq_len) { - key_dst[i * mHeadDim * hP + j * hP + k] = key_src[s * mNumHead * mHeadDim + h * mHeadDim + j]; + key_dst[i * mHeadDim * hP + j * hP + k] = key_src[s * mKvNumHead * mHeadDim + kv_h * mHeadDim + j]; } } } @@ -65,7 +65,7 @@ static void prefill_pack(Tensor* query, Tensor* key, Tensor* value, char* query_ for (int k = 0; k < hP; k++) { int hd = i * hP + k; if (hd < mHeadDim) { - value_dst[i * mMaxLength * hP + j * hP + k] = value_src[j * mNumHead * mHeadDim + h * mHeadDim + hd]; + value_dst[i * mMaxLength * hP + j * hP + k] = value_src[j * mKvNumHead * mHeadDim + kv_h * mHeadDim + hd]; } } } @@ -74,7 +74,7 @@ static void prefill_pack(Tensor* query, Tensor* key, Tensor* value, char* query_ template static void decode_pack(Tensor* query, Tensor* key, Tensor* value, char* query_ptr, char* key_ptr, char* value_ptr, - int mMaxLength, int mPastLength, int mHeadDim, int mValueH, int eP, int hP, int h) { + int mMaxLength, int mPastLength, int mHeadDim, int mValueH, int eP, int hP, int h, int kv_h) { auto query_src = query->host(); auto key_src = key->host(); auto value_src = value->host(); @@ -88,12 +88,12 @@ static void decode_pack(Tensor* query, Tensor* key, Tensor* value, char* query_p int outside_offset = UP_DIV(mPastLength, hP); int inside_offset = mPastLength % hP; for (int i = 0; i < mHeadDim; i++) { - key_dst[(outside_offset - (inside_offset != 0)) * mHeadDim * hP + i * hP + inside_offset] = key_src[h * mHeadDim + i]; + key_dst[(outside_offset - (inside_offset != 0)) * mHeadDim * hP + i * hP + inside_offset] = key_src[kv_h * mHeadDim + i]; } // transpose value: [1, num_head, head_dim] -> numhead, [head_dim/hP, kv_seq_len, hP] for (int i = 0; i < mValueH; i++) { for (int j = 0; j < hP; j++) { - value_dst[i * mMaxLength * hP + mPastLength * hP + j] = value_src[h * mHeadDim + i * hP + j]; + value_dst[i * mMaxLength * hP + mPastLength * hP + j] = value_src[kv_h * mHeadDim + i * hP + j]; } } } @@ -163,51 +163,50 @@ static void decode_softmax(float* mask_qk, float* softmax_qk, char* unpack_qk, c } } -void CPUAttentionImpl::allocKVCache() { - if (!mKVCache || mPastLength < mMaxLength) { +void CPUAttention::allocKVCache() { + if (!mKVCache || mResource->mPastLength < 
mResource->mMaxLength) { return; } - mMaxLength = mPastLength + mExpandChunk; + mResource->mMaxLength = mResource->mPastLength + mResource->mExpandChunk; // past_key: [1, numhead, headdim, maxlen] -> numhead, [headdim, maxlen] -> pack_b -> numhead, [maxlen/hP, head_dim, hP] - mPastKey.reset(Tensor::createDevice({mNumHead, UP_DIV(mMaxLength, hP), mHeadDim, hP})); + mResource->mPastKey.reset(Tensor::createDevice({mResource->mKvNumHead, UP_DIV(mResource->mMaxLength, hP), mResource->mHeadDim, hP})); // past_value: [1, numhead, maxlen, headdim] -> numhead, [maxlen, headdim] -> pack_b -> numhead, [head_dim/hP, max_len, hP] - mPastValue.reset(Tensor::createDevice({mNumHead, mValueH, mMaxLength, hP})); - backend()->onAcquireBuffer(mPastKey.get(), Backend::STATIC); - backend()->onAcquireBuffer(mPastValue.get(), Backend::STATIC); + mResource->mPastValue.reset(Tensor::createDevice({mResource->mKvNumHead, mResource->mValueH, mResource->mMaxLength, hP})); + backend()->onAcquireBuffer(mResource->mPastKey.get(), Backend::STATIC); + backend()->onAcquireBuffer(mResource->mPastValue.get(), Backend::STATIC); } -void CPUAttentionImpl::reallocKVCache() { - if (!mKVCache || mPastLength < mMaxLength) { +void CPUAttention::reallocKVCache() { + if (!mKVCache || mResource->mPastLength < mResource->mMaxLength) { return; } - mMaxLength = mPastLength + mExpandChunk; + mResource->mMaxLength = mResource->mPastLength + mResource->mExpandChunk; // past_key: [1, numhead, headdim, maxlen] -> numhead, [headdim, maxlen] -> pack_b -> numhead, [maxlen/hP, head_dim, hP] - auto new_key = Tensor::createDevice({mNumHead, UP_DIV(mMaxLength, hP), mHeadDim, hP}); + auto new_key = Tensor::createDevice({mResource->mKvNumHead, UP_DIV(mResource->mMaxLength, hP), mResource->mHeadDim, hP}); // past_value: [1, numhead, maxlen, headdim] -> numhead, [maxlen, headdim] -> pack_b -> numhead, [head_dim/hP, max_len, hP] - auto new_value = Tensor::createDevice({mNumHead, mValueH, mMaxLength, hP}); + auto new_value = Tensor::createDevice({mResource->mKvNumHead, mResource->mValueH, mResource->mMaxLength, hP}); backend()->onAcquireBuffer(new_key, Backend::STATIC); backend()->onAcquireBuffer(new_value, Backend::STATIC); // copy - for (int h = 0; h < mNumHead; h++) { - ::memset(new_key->host() + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * bytes, 0, UP_DIV(mMaxLength, hP) * mHeadDim * hP * bytes); - ::memset(new_value->host() + h * mValueH * mMaxLength * hP * bytes, 0, mValueH * mMaxLength * hP * bytes); - ::memcpy(new_key->host() + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * bytes, - mPastKey->host() + h * UP_DIV(mPastLength, hP) * mHeadDim * hP * bytes, - UP_DIV(mPastLength, hP) * mHeadDim * hP * bytes); - for (int i = 0; i < mValueH; i++) { - ::memcpy(new_value->host() + (h * mValueH + i) * mMaxLength * hP * bytes, - mPastValue->host() + (h * mValueH + i) * mPastLength * hP * bytes, - mPastLength * hP * bytes); + for (int h = 0; h < mResource->mKvNumHead; h++) { + ::memset(new_key->host() + h * UP_DIV(mResource->mMaxLength, hP) * mResource->mHeadDim * hP * bytes, 0, UP_DIV(mResource->mMaxLength, hP) * mResource->mHeadDim * hP * bytes); + ::memset(new_value->host() + h * mResource->mValueH * mResource->mMaxLength * hP * bytes, 0, mResource->mValueH * mResource->mMaxLength * hP * bytes); + ::memcpy(new_key->host() + h * UP_DIV(mResource->mMaxLength, hP) * mResource->mHeadDim * hP * bytes, + mResource->mPastKey->host() + h * UP_DIV(mResource->mPastLength, hP) * mResource->mHeadDim * hP * bytes, + UP_DIV(mResource->mPastLength, hP) * mResource->mHeadDim 
* hP * bytes); + for (int i = 0; i < mResource->mValueH; i++) { + ::memcpy(new_value->host() + (h * mResource->mValueH + i) * mResource->mMaxLength * hP * bytes, + mResource->mPastValue->host() + (h * mResource->mValueH + i) * mResource->mPastLength * hP * bytes, + mResource->mPastLength * hP * bytes); } } - mPastKey.reset(new_key); - mPastValue.reset(new_value); - mTempQK.reset(Tensor::createDevice({mThreadNum, eP + 2, mMaxLength})); + mResource->mPastKey.reset(new_key); + mResource->mPastValue.reset(new_value); + mTempQK.reset(Tensor::createDevice({mThreadNum, eP + 2, mResource->mMaxLength})); backend()->onAcquireBuffer(mTempQK.get(), Backend::STATIC); } -ErrorCode CPUAttentionImpl::onResize(Backend* _backend, const std::vector& inputs, const std::vector& outputs) { - mBackend = _backend; +ErrorCode CPUAttention::onResize(const std::vector& inputs, const std::vector& outputs) { auto core = static_cast(backend())->functions(); int unit = core->pack; bytes = core->bytes; @@ -221,26 +220,27 @@ ErrorCode CPUAttentionImpl::onResize(Backend* _backend, const std::vectorthreadNumber(); mIsDecode = seq_len == 1; - if (mPastLength == 0 || seq_len > 1) { - mPastLength = seq_len; + if (mResource->mPastLength == 0 || seq_len > 1) { + mResource->mPastLength = seq_len; } - mNumHead = shape[2]; - mHeadDim = shape[3]; - mScale = 1.0 / sqrt(mHeadDim); - mValueH = UP_DIV(mHeadDim, hP); + mResource->mNumHead = shape[2]; + mResource->mKvNumHead = key->shape()[2]; + mResource->mHeadDim = shape[3]; + mResource->mScale = 1.0 / sqrt(mResource->mHeadDim); + mResource->mValueH = UP_DIV(mResource->mHeadDim, hP); int query_e = UP_DIV(seq_len, eP); int key_h = UP_DIV(seq_len, hP); // mPastLength = 10; // alloc kv cache allocKVCache(); - int tileCount = UP_DIV(mNumHead, mThreadNum); + int tileCount = UP_DIV(mResource->mNumHead, mThreadNum); // temp_query - mPackQ.reset(Tensor::createDevice({mThreadNum, query_e, mHeadDim, eP})); - mPackQKV.reset(Tensor::createDevice({mThreadNum, UP_DIV(mHeadDim, unit), seq_len, unit})); + mPackQ.reset(Tensor::createDevice({mThreadNum, query_e, mResource->mHeadDim, eP})); + mPackQKV.reset(Tensor::createDevice({mThreadNum, UP_DIV(mResource->mHeadDim, unit), seq_len, unit})); if (mIsDecode) { - mTempQK.reset(Tensor::createDevice({mThreadNum, eP + 2, mMaxLength})); + mTempQK.reset(Tensor::createDevice({mThreadNum, eP + 2, mResource->mMaxLength})); backend()->onAcquireBuffer(mTempQK.get(), Backend::DYNAMIC); } else { mTempQK.reset(Tensor::createDevice({mThreadNum, 4, seq_len, seq_len})); @@ -254,12 +254,11 @@ ErrorCode CPUAttentionImpl::onResize(Backend* _backend, const std::vector& inputs, const std::vector& outputs) { +ErrorCode CPUAttention::onExecute(const std::vector& inputs, const std::vector& outputs) { auto core = static_cast(backend())->functions(); int unit = core->pack; bytes = core->bytes; core->MNNGetMatMulPackMode(&eP, &lP, &hP); - mBackend = _backend; auto matmulUnit = core->MNNPackedMatMul; auto matmulRemain = core->MNNPackedMatMulRemain; @@ -272,37 +271,40 @@ ErrorCode CPUAttentionImpl::onExecute(Backend* _backend, const std::vectorthreadNumber(); mIsDecode = seq_len == 1; - if (mPastLength == 0 || seq_len > 1) { - mPastLength = seq_len; + if (mResource->mPastLength == 0 || seq_len > 1) { + mResource->mPastLength = seq_len; } - mNumHead = shape[2]; - mHeadDim = shape[3]; - mScale = 1.0 / sqrt(mHeadDim); - mValueH = UP_DIV(mHeadDim, hP); + mResource->mNumHead = shape[2]; + mResource->mKvNumHead = key->shape()[2]; + int group_size = mResource->mNumHead / 
mResource->mKvNumHead; + mResource->mHeadDim = shape[3]; + mResource->mScale = 1.0 / sqrt(mResource->mHeadDim); + mResource->mValueH = UP_DIV(mResource->mHeadDim, hP); int query_e = UP_DIV(seq_len, eP); int key_h = UP_DIV(seq_len, hP); // mPastLength = 10; - int tileCount = UP_DIV(mNumHead, mThreadNum); + int tileCount = UP_DIV(mResource->mNumHead, mThreadNum); // try calloc kv cache mPrefill = [=](int tId){ - auto pack_q = mPackQ->host() + tId * query_e * mHeadDim * eP * bytes; + auto pack_q = mPackQ->host() + tId * query_e * mResource->mHeadDim * eP * bytes; auto pack_qk = mTempQK->host() + tId * 4 * seq_len * seq_len * bytes; auto unpack_qk = pack_qk + seq_len * seq_len * 2 * bytes; auto mask_qk = reinterpret_cast(pack_qk); auto softmax_qk = reinterpret_cast(unpack_qk); - auto pack_qkv = mPackQKV->host() + tId * UP_DIV(mHeadDim, unit) * seq_len * unit * bytes; + auto pack_qkv = mPackQKV->host() + tId * UP_DIV(mResource->mHeadDim, unit) * seq_len * unit * bytes; int head_index = tId * tileCount; - for (int h = head_index; h < head_index + tileCount && h < mNumHead; h++) { + for (int h = head_index; h < head_index + tileCount && h < mResource->mNumHead; h++) { // pack for matmul - auto key_dst = mPastKey->host() + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * bytes; - auto value_dst = mPastValue->host() + h * mValueH * mMaxLength * hP * bytes; + int kv_h = h / group_size; + auto key_dst = mResource->mPastKey->host() + kv_h * UP_DIV(mResource->mMaxLength, hP) * mResource->mHeadDim * hP * bytes; + auto value_dst = mResource->mPastValue->host() + kv_h * mResource->mValueH * mResource->mMaxLength * hP * bytes; if (bytes == 2) { - prefill_pack(query, key, value, pack_q, key_dst, value_dst, mMaxLength, mNumHead, mHeadDim, mValueH, eP, hP, query_e, key_h, seq_len, h); + prefill_pack(query, key, value, pack_q, key_dst, value_dst, mResource->mMaxLength, mResource->mNumHead, mResource->mKvNumHead, mResource->mHeadDim, mResource->mValueH, eP, hP, query_e, key_h, seq_len, h, kv_h); } else { - prefill_pack(query, key, value, pack_q, key_dst, value_dst, mMaxLength, mNumHead, mHeadDim, mValueH, eP, hP, query_e, key_h, seq_len, h); + prefill_pack(query, key, value, pack_q, key_dst, value_dst, mResource->mMaxLength, mResource->mNumHead, mResource->mKvNumHead, mResource->mHeadDim, mResource->mValueH, eP, hP, query_e, key_h, seq_len, h, kv_h); } // query @ key int loop_e = seq_len / eP; @@ -311,32 +313,32 @@ ErrorCode CPUAttentionImpl::onExecute(Backend* _backend, const std::vectormHeadDim; parameters[2] = seq_len; parameters[3] = seq_len * unit * bytes; parameters[4] = 0; parameters[5] = 0; - matmulUnit((float*)(pack_qk + (i * eP * unit) * bytes), (float*)(pack_q + (i * mHeadDim * eP) * bytes), (float*)key_dst, parameters, nullptr, nullptr, nullptr, nullptr); + matmulUnit((float*)(pack_qk + (i * eP * unit) * bytes), (float*)(pack_q + (i * mResource->mHeadDim * eP) * bytes), (float*)key_dst, parameters, nullptr, nullptr, nullptr, nullptr); } { size_t shapeParameters[6]; size_t* parameters = shapeParameters; parameters[0] = eP * bytes; - parameters[1] = mHeadDim; + parameters[1] = mResource->mHeadDim; parameters[2] = seq_len; parameters[3] = seq_len * unit * bytes; parameters[4] = 0; parameters[5] = 0; - matmulRemain((float*)(pack_qk + (loop_e * eP * unit) * bytes), (float*)(pack_q + (loop_e * mHeadDim * eP) * bytes), (float*)key_dst, remain, parameters, nullptr, nullptr, nullptr, nullptr); + matmulRemain((float*)(pack_qk + (loop_e * eP * unit) * bytes), (float*)(pack_q + (loop_e * mResource->mHeadDim * eP) * 
bytes), (float*)key_dst, remain, parameters, nullptr, nullptr, nullptr, nullptr); } int area_offset[1] {seq_len}; core->MNNUnpackCUnitTranspose((float*)unpack_qk, (float*)pack_qk, seq_len, seq_len, area_offset); // div scale and mask auto mask_ptr = mask->host(); if (bytes == 2) { - prefill_softmax(mask_ptr, mask_qk, softmax_qk, unpack_qk, pack_qk, mScale, eP, query_e, seq_len, -65504.0, float_mask); + prefill_softmax(mask_ptr, mask_qk, softmax_qk, unpack_qk, pack_qk, mResource->mScale, eP, query_e, seq_len, -65504.0, float_mask); } else { - prefill_softmax(mask_ptr, mask_qk, softmax_qk, unpack_qk, pack_qk, mScale, eP, query_e, seq_len, std::numeric_limits::lowest(), float_mask); + prefill_softmax(mask_ptr, mask_qk, softmax_qk, unpack_qk, pack_qk, mResource->mScale, eP, query_e, seq_len, std::numeric_limits::lowest(), float_mask); } // qk @ v for (int i = 0 ; i < loop_e; i++) { @@ -344,10 +346,10 @@ ErrorCode CPUAttentionImpl::onExecute(Backend* _backend, const std::vectormHeadDim; parameters[3] = seq_len * unit * bytes; parameters[4] = 0; - parameters[5] = (mMaxLength - seq_len) * hP * bytes; + parameters[5] = (mResource->mMaxLength - seq_len) * hP * bytes; matmulUnit((float*)(pack_qkv + (i * eP * unit) * bytes), (float*)(pack_qk + (i * seq_len * eP) * bytes), (float*)value_dst, parameters, nullptr, nullptr, nullptr, nullptr); } { @@ -355,46 +357,47 @@ ErrorCode CPUAttentionImpl::onExecute(Backend* _backend, const std::vectormHeadDim; parameters[3] = seq_len * unit * bytes; parameters[4] = 0; - parameters[5] = (mMaxLength - seq_len) * hP * bytes; + parameters[5] = (mResource->mMaxLength - seq_len) * hP * bytes; matmulRemain((float*)(pack_qkv + (loop_e * eP * unit) * bytes), (float*)(pack_qk + (loop_e * seq_len * eP) * bytes), (float*)value_dst, remain, parameters, nullptr, nullptr, nullptr, nullptr); } // transpose: [head_dim/unit, seq_len, unit] -> [seq_len, num_head, head_dim] - auto dst_ptr = outputs[0]->host() + h * mHeadDim * bytes; + auto dst_ptr = outputs[0]->host() + h * mResource->mHeadDim * bytes; if (bytes == 2) { - prefill_unpack(pack_qkv, dst_ptr, mNumHead, mHeadDim, unit, seq_len); + prefill_unpack(pack_qkv, dst_ptr, mResource->mNumHead, mResource->mHeadDim, unit, seq_len); } else { - prefill_unpack(pack_qkv, dst_ptr, mNumHead, mHeadDim, unit, seq_len); + prefill_unpack(pack_qkv, dst_ptr, mResource->mNumHead, mResource->mHeadDim, unit, seq_len); } } }; mDecode = [=](int tId) { - int kv_seq_len = mPastLength + 1; - auto pack_q = mPackQ->host() + tId * mHeadDim * eP * bytes; + int kv_seq_len = mResource->mPastLength + 1; + auto pack_q = mPackQ->host() + tId * mResource->mHeadDim * eP * bytes; auto pack_qk = mTempQK->host() + tId * (eP + 2) * kv_seq_len * bytes; auto unpack_qk = pack_qk + kv_seq_len * eP * bytes; auto mask_qk = reinterpret_cast(pack_qk); auto softmax_qk = reinterpret_cast(unpack_qk); - auto pack_qkv = mPackQKV->host() + tId * UP_DIV(mHeadDim, unit) * unit * bytes; + auto pack_qkv = mPackQKV->host() + tId * UP_DIV(mResource->mHeadDim, unit) * unit * bytes; int head_index = tId * tileCount; - for (int h = head_index; h < head_index + tileCount && h < mNumHead; h++) { - auto key_dst = mPastKey->host() + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * bytes; - auto value_dst = mPastValue->host() + h * mValueH * mMaxLength * hP * bytes; + for (int h = head_index; h < head_index + tileCount && h < mResource->mNumHead; h++) { + int kv_h = h / group_size; + auto key_dst = mResource->mPastKey->host() + kv_h * UP_DIV(mResource->mMaxLength, hP) * mResource->mHeadDim * hP * 
bytes; + auto value_dst = mResource->mPastValue->host() + kv_h * mResource->mValueH * mResource->mMaxLength * hP * bytes; // pack for matmul if (bytes == 2) { - decode_pack(query, key, value, pack_q, key_dst, value_dst, mMaxLength, mPastLength, mHeadDim, mValueH, eP, hP, h); + decode_pack(query, key, value, pack_q, key_dst, value_dst, mResource->mMaxLength, mResource->mPastLength, mResource->mHeadDim, mResource->mValueH, eP, hP, h, kv_h); } else { - decode_pack(query, key, value, pack_q, key_dst, value_dst, mMaxLength, mPastLength, mHeadDim, mValueH, eP, hP, h); + decode_pack(query, key, value, pack_q, key_dst, value_dst, mResource->mMaxLength, mResource->mPastLength, mResource->mHeadDim, mResource->mValueH, eP, hP, h, kv_h); } // query @ key: [1, head_dim] @ [head_dim, kv_seq_len] -> [1, kv_seq_len] size_t shapeParameters[6]; size_t* parameters = shapeParameters; parameters[0] = eP * bytes; - parameters[1] = mHeadDim; + parameters[1] = mResource->mHeadDim; parameters[2] = kv_seq_len; parameters[3] = seq_len * unit * bytes; parameters[4] = 0; @@ -403,9 +406,9 @@ ErrorCode CPUAttentionImpl::onExecute(Backend* _backend, const std::vectorMNNUnpackCUnitTranspose((float*)unpack_qk, (float*)pack_qk, seq_len, kv_seq_len, area_offset); if (bytes == 2) { - decode_softmax(mask_qk, softmax_qk, unpack_qk, pack_qk, mScale, eP, kv_seq_len); + decode_softmax(mask_qk, softmax_qk, unpack_qk, pack_qk, mResource->mScale, eP, kv_seq_len); } else { - decode_softmax(mask_qk, softmax_qk, unpack_qk, pack_qk, mScale, eP, kv_seq_len); + decode_softmax(mask_qk, softmax_qk, unpack_qk, pack_qk, mResource->mScale, eP, kv_seq_len); } // qk @ v: [1, kv_seq_len] @ [kv_seq_len, head_dim] -> [1, head_dim] { @@ -413,14 +416,14 @@ ErrorCode CPUAttentionImpl::onExecute(Backend* _backend, const std::vectormHeadDim; parameters[3] = 1 * unit * bytes; - parameters[5] = (mMaxLength - kv_seq_len) * hP * bytes; + parameters[5] = (mResource->mMaxLength - kv_seq_len) * hP * bytes; matmulRemain((float*)pack_qkv, (float*)pack_qk, (float*)value_dst, 1, parameters, nullptr, nullptr, nullptr, nullptr); } // transpose: [head_dim/unit, 1, unit] -> [1, num_head, head_dim] - auto dst_ptr = outputs[0]->host() + h * mHeadDim * bytes; - core->MNNUnpackCUnitTranspose((float*)dst_ptr, (float*)pack_qkv, 1, mHeadDim, area_offset); + auto dst_ptr = outputs[0]->host() + h * mResource->mHeadDim * bytes; + core->MNNUnpackCUnitTranspose((float*)dst_ptr, (float*)pack_qkv, 1, mResource->mHeadDim, area_offset); } }; mFunction = mIsDecode ? 
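The attention rework above is what the grouped-query llama variants (llama3, and the larger llama2 models) need: the KV cache is now allocated per KV head (mKvNumHead) instead of per query head, and every query head h reaches its cache slot through kv_h = h / group_size, where group_size = mNumHead / mKvNumHead. A minimal C++ sketch of that mapping, using illustrative names rather than MNN's members:

    #include <cassert>
    #include <cstdio>

    // Sketch of the kv_h = h / group_size indexing used in the prefill/decode
    // lambdas above; the function and parameter names here are illustrative.
    int kvHeadFor(int queryHead, int numQueryHeads, int numKvHeads) {
        assert(numQueryHeads % numKvHeads == 0);  // GQA needs an exact grouping
        const int groupSize = numQueryHeads / numKvHeads;
        return queryHead / groupSize;             // several query heads share one cache slot
    }

    int main() {
        // A llama3-8B style layout: 32 query heads sharing 8 KV heads.
        for (int h = 0; h < 32; ++h) {
            std::printf("query head %2d -> kv head %d\n", h, kvHeadFor(h, 32, 8));
        }
        return 0;
    }

With 32 query heads over 8 KV heads, heads 0 to 3 read and write KV slot 0, heads 4 to 7 slot 1, and so on, which is why the cache tensors above are sized by mKvNumHead.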
mDecode : mPrefill; @@ -430,32 +433,25 @@ ErrorCode CPUAttentionImpl::onExecute(Backend* _backend, const std::vectormPastLength += mIsDecode; return NO_ERROR; } -CPUAttention::CPUAttention(Backend* backend, bool kv_cahce) : Execution(backend) { - mImpl.reset(new CPUAttentionImpl(backend, kv_cahce)); -} - -CPUAttention::CPUAttention(std::shared_ptr impl, Backend *backend) : Execution(backend), mImpl(impl) {} - -ErrorCode CPUAttention::onResize(const std::vector& inputs, const std::vector& outputs) { - return mImpl->onResize(backend(), inputs, outputs); -} - -ErrorCode CPUAttention::onExecute(const std::vector& inputs, const std::vector& outputs) { - return mImpl->onExecute(backend(), inputs, outputs); -} - bool CPUAttention::onClone(Backend* bn, const Op* op, Execution** dst) { if (nullptr == dst) { return true; } - *dst = new CPUAttention(mImpl, bn); + auto tmp = new CPUAttention(bn, mKVCache); + tmp->mResource = mResource; + *dst = tmp; return true; } +CPUAttention::CPUAttention(Backend *backend, bool kv_cache) : Execution(backend) { + mKVCache = kv_cache; + mResource.reset(new Resource); +} + class CPUAttentionCreator : public CPUBackend::Creator { public: virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, diff --git a/source/backend/cpu/CPUAttention.hpp b/source/backend/cpu/CPUAttention.hpp index 1cbc26aba..6e3154db7 100644 --- a/source/backend/cpu/CPUAttention.hpp +++ b/source/backend/cpu/CPUAttention.hpp @@ -16,41 +16,32 @@ namespace MNN { -class CPUAttentionImpl { -public: - CPUAttentionImpl(Backend *backend, bool kv_cache) : mBackend(backend), mKVCache(kv_cache) {} - ~CPUAttentionImpl() = default; - ErrorCode onResize(Backend *backend, const std::vector &inputs, const std::vector &outputs); - ErrorCode onExecute(Backend *backend, const std::vector &inputs, const std::vector &outputs); -private: - void allocKVCache(); - void reallocKVCache(); - Backend* backend() { return mBackend; } -private: - Backend* mBackend; - bool mKVCache; - float mScale; - const int mExpandChunk = 64; - int mThreadNum = 1; - bool mIsDecode = false; - int mPastLength = 0, mMaxLength = 0; - std::shared_ptr mPastKey, mPastValue, mTempQK; - std::shared_ptr mPackQ, mPackQKV; - int mNumHead = 0, mHeadDim = 0, mValueH = 0; - int eP, lP, hP, bytes; - std::function mFunction, mPrefill, mDecode; -}; class CPUAttention : public Execution { public: CPUAttention(Backend *backend, bool kv_cache); - CPUAttention(std::shared_ptr impl, Backend *backend); virtual ~CPUAttention() = default; virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override; + struct Resource { + std::shared_ptr mPastKey; + std::shared_ptr mPastValue; + float mScale; + const int mExpandChunk = 64; + int mPastLength = 0, mMaxLength = 0; + int mNumHead = 0, mKvNumHead = 0, mHeadDim = 0, mValueH = 0; + }; private: - std::shared_ptr mImpl; + void allocKVCache(); + void reallocKVCache(); + bool mIsDecode = false; + bool mKVCache; + int mThreadNum = 1; + std::shared_ptr mResource; + std::shared_ptr mTempQK, mPackQ, mPackQKV; + int eP, lP, hP, bytes; + std::function mFunction, mPrefill, mDecode; }; } // namespace MNN diff --git a/source/backend/cpu/CPURaster.cpp b/source/backend/cpu/CPURaster.cpp index 6d5f587b7..3722e8a27 100644 --- a/source/backend/cpu/CPURaster.cpp +++ b/source/backend/cpu/CPURaster.cpp @@ -93,6 +93,7 @@ 
ErrorCode CPURaster::onResize(const std::vector &____inputs, const std } // input is NC4HW4 add Convert std::vector forRelease; + TensorUtils::FuseWrap fuseUtils; for (int i=0; i< des->regions.size(); ++i) { auto& slice = des->regions[i]; auto origin = slice.origin; @@ -125,10 +126,11 @@ ErrorCode CPURaster::onResize(const std::vector &____inputs, const std regionTmp.size[1] = core->pack; regionTmp.size[2] = area; regionTmp.origin = slice.origin; - std::shared_ptr newSlice(new Tensor::InsideDescribe::Region); - *newSlice = slice; - bool merge = TensorUtils::fuseRegion(regionTmp, *newSlice); + bool merge = fuseUtils.match(regionTmp, slice); if (merge) { + std::shared_ptr newSlice(new Tensor::InsideDescribe::Region); + *newSlice = slice; + fuseUtils.apply(regionTmp, *newSlice); // cache the merged tensor mTempInputCopy.emplace_back(std::make_pair(origin, newSlice.get())); mCacheRegions.emplace_back(newSlice); diff --git a/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMulRemain_int4.S b/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMulRemain_int4.S index e0dabde36..337c419a6 100644 --- a/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMulRemain_int4.S +++ b/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMulRemain_int4.S @@ -12,6 +12,13 @@ .text .align 5 +.macro MNN_ADD_FLAOT s0, s1, s2, s3, z0, z1, z2, z3 + fadd \s0\().4s, \s0\().4s, \z0\().4s + fadd \s1\().4s, \s1\().4s, \z1\().4s + fadd \s2\().4s, \s2\().4s, \z2\().4s + fadd \s3\().4s, \s3\().4s, \z3\().4s + +.endm // 12 * 8 MatMul asm_function MNNPackedMatMulRemain_int4 //void MNNPackedMatMulRemain_int4(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b); @@ -36,8 +43,10 @@ ldr x9, [x4, #8] // l ldr x10, [x4, #16] // h ldr x7, [x4, #24] // cStride -ldr x19, [x4, #40] // bExtraStride +ldr x19, [x4, #40] // bExtraStride = (LSize - l) * (hP * sizeof(int4_t)) +ldr x26, [x4, #48] // blockId +//add x19, x19, x9, LSL #2 // bStride = (hP * sizeof(int4_t)) * l + bExtraStride add x10, x10, #3 lsr x10, x10, #2 @@ -47,6 +56,7 @@ dup v6.4s, v5.s[2] // Min Value dup v7.4s, v5.s[3] // Max Value Start: +sub x25, x7, #64 E8: cmp x3, #8 @@ -195,8 +205,20 @@ LoopE8: fmla v30.4s, v15.4s, v1.s[2] fmla v31.4s, v15.4s, v1.s[3] - cbz x5, StoreLH8 + cbz x26, AddBiasLH8 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0], x25 + MNN_ADD_FLAOT v16, v17, v18, v19, v0, v1, v2, v3 + MNN_ADD_FLAOT v24, v25, v26, v27, v8, v9, v10, v11 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0] + MNN_ADD_FLAOT v20, v21, v22, v23, v0, v1, v2, v3 + MNN_ADD_FLAOT v28, v29, v30, v31, v8, v9, v10, v11 + sub x0, x0, #128 + sub x0, x0, x25 + AddBiasLH8: + cbz x20, PostTreatLH8 ld1 {v0.4s, v1.4s}, [x20], #32 fmla v16.4s, v0.4s, v5.s[1] @@ -220,6 +242,7 @@ LoopE8: fmla v31.4s, v1.4s, v5.s[1] PostTreatLH8: + cbz x5, StoreLH8 fmax v16.4s, v16.4s, v6.4s fmax v17.4s, v17.4s, v6.4s fmax v18.4s, v18.4s, v6.4s @@ -355,8 +378,22 @@ LoopE8: fmla v22.4s, v14.4s, v1.s[2] fmla v23.4s, v14.4s, v1.s[3] - cbz x5, StoreLH8x4 + cbz x26, AddBiasLH8x4 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0] + fadd v16.4s, v16.4s, v0.4s + fadd v17.4s, v17.4s, v1.4s + fadd v18.4s, v18.4s, v2.4s + fadd v19.4s, v19.4s, v3.4s + + fadd v20.4s, v20.4s, v8.4s + fadd v21.4s, v21.4s, v9.4s + fadd v22.4s, v22.4s, v10.4s + fadd v23.4s, v23.4s, v11.4s + sub x0, x0, #64 + AddBiasLH8x4: + 
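Across these low-memory GEMM kernels the new blockId word (parameter[6], loaded into x26 here and x21 in the non-remain kernels) switches the output tile from an overwrite to an accumulation: when blockId is non-zero the existing C tile is loaded and added into (the MNN_ADD_FLAOT macro and the LE*_BLOCK_GT_0 paths) instead of being re-initialized with fmul, so a weight tensor quantized in several blocks along the reduction dimension can be processed one block at a time. Bias is likewise folded in only when a bias pointer is present, and the min/max clamp only when postParameters is, which lets the caller defer both to the final block. A rough scalar C++ sketch of that contract, using plain row-major layouts instead of MNN's packed eP/hP tiles:

    #include <algorithm>
    #include <cstddef>

    // Reference semantics of one quantization block, as implied by the blockId
    // flag above. Names and layouts are illustrative, not MNN's.
    void blockGemmReference(float* C, const float* A, const float* B,
                            size_t e, size_t lBlock, size_t h,
                            int blockId, const float* bias, const float* minMax) {
        for (size_t x = 0; x < e; ++x) {
            for (size_t y = 0; y < h; ++y) {
                // blockId > 0: accumulate onto the partial result already in C.
                float acc = (blockId > 0) ? C[x * h + y] : 0.0f;
                for (size_t z = 0; z < lBlock; ++z) {
                    acc += A[x * lBlock + z] * B[z * h + y];
                }
                if (bias != nullptr) {    // passed only with the last block
                    acc += bias[y];
                }
                if (minMax != nullptr) {  // relu/relu6 clamp, last block only
                    acc = std::min(std::max(acc, minMax[0]), minMax[1]);
                }
                C[x * h + y] = acc;
            }
        }
    }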
cbz x20, PostTreatLH8x4 ld1 {v0.4s}, [x20] fmla v16.4s, v0.4s, v5.s[1] @@ -370,6 +407,7 @@ LoopE8: fmla v23.4s, v0.4s, v5.s[1] PostTreatLH8x4: + cbz x5, StoreLH8x4 fmax v16.4s, v16.4s, v6.4s fmax v17.4s, v17.4s, v6.4s fmax v18.4s, v18.4s, v6.4s @@ -444,6 +482,7 @@ blt E1 fadd v4.4s, v4.4s, v15.4s ld1 {v0.4s}, [x15], x11 + cbnz x26, LE4H8_BLOCK_GT_0 fmul v16.4s, v3.4s, v0.s[0] fmul v17.4s, v3.4s, v0.s[1] fmul v18.4s, v3.4s, v0.s[2] @@ -453,7 +492,24 @@ blt E1 fmul v21.4s, v4.4s, v0.s[1] fmul v22.4s, v4.4s, v0.s[2] fmul v23.4s, v4.4s, v0.s[3] + b LE4H8_INIT_END + + LE4H8_BLOCK_GT_0: + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], x7 + ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0] + fmla v16.4s, v3.4s, v0.s[0] + fmla v17.4s, v3.4s, v0.s[1] + fmla v18.4s, v3.4s, v0.s[2] + fmla v19.4s, v3.4s, v0.s[3] + fmla v20.4s, v4.4s, v0.s[0] + fmla v21.4s, v4.4s, v0.s[1] + fmla v22.4s, v4.4s, v0.s[2] + fmla v23.4s, v4.4s, v0.s[3] + sub x0, x0, x7 + + + LE4H8_INIT_END: beq E4LoopLEnd subs x12, x12, #1 @@ -524,9 +580,8 @@ blt E1 sub x8, x8, #2 cmp x8, #2 - cbz x5, StoreLH4x8 - AddBiasLH4x8: + cbz x20, PostTreatLH4x8 ld1 {v0.4s, v1.4s}, [x20], #32 fmla v16.4s, v0.4s, v5.s[1] @@ -540,6 +595,7 @@ blt E1 fmla v23.4s, v1.4s, v5.s[1] PostTreatLH4x8: + cbz x5, StoreLH4x8 fmax v16.4s, v16.4s, v6.4s fmax v17.4s, v17.4s, v6.4s fmax v18.4s, v18.4s, v6.4s @@ -593,11 +649,22 @@ blt E1 fadd v3.4s, v3.4s, v14.4s ld1 {v0.4s}, [x15], x11 + + cbnz x26, LE4H4_BLOCK_GT_0 fmul v16.4s, v3.4s, v0.s[0] fmul v17.4s, v3.4s, v0.s[1] fmul v18.4s, v3.4s, v0.s[2] fmul v19.4s, v3.4s, v0.s[3] + b LE4H4_INIT_END + LE4H4_BLOCK_GT_0: + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] + fmla v16.4s, v3.4s, v0.s[0] + fmla v17.4s, v3.4s, v0.s[1] + fmla v18.4s, v3.4s, v0.s[2] + fmla v19.4s, v3.4s, v0.s[3] + + LE4H4_INIT_END: beq E4LoopLREnd E4LoopLR: @@ -623,8 +690,8 @@ blt E1 bne E4LoopLR E4LoopLREnd: - cbz x5, StoreLH4x4 AddBiasLH4x4: + cbz x20, PostTreatLH4x4 ld1 {v0.4s}, [x20] fmla v16.4s, v0.4s, v5.s[1] @@ -634,6 +701,7 @@ blt E1 PostTreatLH4x4: + cbz x5, StoreLH4x4 fmax v16.4s, v16.4s, v6.4s fmax v17.4s, v17.4s, v6.4s fmax v18.4s, v18.4s, v6.4s @@ -777,14 +845,22 @@ LoopE1: fadd v16.4s, v14.4s, v16.4s fadd v20.4s, v15.4s, v20.4s - cbz x5, StoreLH1x8 + cbz x26, AddBiasLH1x8 + ld1 {v2.4s}, [x0], x7 + ld1 {v3.4s}, [x0] + sub x0, x0, x7 + fadd v16.4s, v16.4s, v2.4s + fadd v20.4s, v20.4s, v3.4s + AddBiasLH1x8: + cbz x20, PostTreatLH1x8 ld1 {v0.4s, v1.4s}, [x20], #32 fmla v16.4s, v0.4s, v5.s[1] fmla v20.4s, v1.4s, v5.s[1] PostTreatLH1x8: + cbz x5, StoreLH1x8 fmax v16.4s, v16.4s, v6.4s fmax v20.4s, v20.4s, v6.4s fmin v16.4s, v16.4s, v7.4s @@ -839,12 +915,18 @@ LoopE1: fmul v16.4s, v12.4s, v16.4s fmla v16.4s, v14.4s, v4.s[0] - cbz x5, StoreLH1x4 + cbz x26, AddBiasLH1x4 + ld1 {v0.4s}, [x0] + fadd v16.4s, v16.4s, v0.4s + b PostTreatLH1x4 + AddBiasLH1x4: + cbz x20, PostTreatLH1x4 ld1 {v0.4s}, [x20] fmla v16.4s, v0.4s, v5.s[1] PostTreatLH1x4: + cbz x5, StoreLH1x4 fmax v16.4s, v16.4s, v6.4s fmin v16.4s, v16.4s, v7.4s diff --git a/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMulRemain_int8.S b/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMulRemain_int8.S index 4daaf415c..1b9677c13 100644 --- a/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMulRemain_int8.S +++ b/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMulRemain_int8.S @@ -12,6 +12,14 @@ .text .align 5 + +.macro MNN_ADD_FLAOT s0, s1, s2, s3, z0, z1, z2, z3 + fadd \s0\().4s, \s0\().4s, \z0\().4s + fadd \s1\().4s, \s1\().4s, \z1\().4s + fadd \s2\().4s, \s2\().4s, \z2\().4s + fadd 
\s3\().4s, \s3\().4s, \z3\().4s + +.endm // 12 * 8 MatMul asm_function MNNPackedMatMulRemain_int8 //void MNNPackedMatMulRemain_int4(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b); @@ -34,17 +42,19 @@ ldr x10, [x4, #16] // h ldr x7, [x4, #24] // cStride ldr x19, [x4, #40] // bExtraStride +ldr x26, [x4, #48] // blockId add x10, x10, #3 lsr x10, x10, #2 -lsl x25, x9, #3 // l*hPack -add x25, x25, x19 +//lsl x25, x9, #3 // l*hPack +//add x25, x25, x19 cbz x5, Start ld1 {v5.4s}, [x5] dup v6.4s, v5.s[2] // Min Value dup v7.4s, v5.s[3] // Max Value Start: +sub x25, x7, #64 E8: cmp x3, #8 @@ -143,8 +153,21 @@ LoopE8: sub x8, x8, #2 cmp x8, #2 - cbz x5, StoreLH8 + + cbz x26, AddBiasLH8 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v16, v17, v18, v19, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], x25 + MNN_ADD_FLAOT v24, v25, v26, v27, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v20, v21, v22, v23, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] + MNN_ADD_FLAOT v28, v29, v30, v31, v0, v1, v2, v3 + sub x0, x0, #128 + sub x0, x0, x25 + AddBiasLH8: + cbz x5, StoreLH8 ld1 {v0.4s, v1.4s}, [x20], #32 fmla v16.4s, v0.4s, v5.s[1] @@ -277,8 +300,15 @@ LoopE8: bne LoopLR LoopLREnd: - cbz x5, StoreLH8x4 + cbz x26, AddBiasLH8x4 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v16, v17, v18, v19, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] + MNN_ADD_FLAOT v20, v21, v22, v23, v0, v1, v2, v3 + sub x0, x0, #64 + AddBiasLH8x4: + cbz x20, PostTreatLH8x4 ld1 {v0.4s}, [x20] fmla v16.4s, v0.4s, v5.s[1] @@ -292,6 +322,7 @@ LoopE8: fmla v23.4s, v0.4s, v5.s[1] PostTreatLH8x4: + cbz x5, StoreLH8x4 fmax v16.4s, v16.4s, v6.4s fmax v17.4s, v17.4s, v6.4s fmax v18.4s, v18.4s, v6.4s @@ -433,9 +464,15 @@ blt E1 sub x8, x8, #2 cmp x8, #2 - cbz x5, StoreLH4x8 + cbz x26, AddBiasLH4x8 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], x7 + MNN_ADD_FLAOT v16, v17, v18, v19, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] + MNN_ADD_FLAOT v20, v21, v22, v23, v0, v1, v2, v3 + sub x0, x0, x7 AddBiasLH4x8: + cbz x20, PostTreatLH4x8 ld1 {v0.4s, v1.4s}, [x20], #32 fmla v16.4s, v0.4s, v5.s[1] @@ -449,6 +486,7 @@ blt E1 fmla v23.4s, v1.4s, v5.s[1] PostTreatLH4x8: + cbz x5, StoreLH4x8 fmax v16.4s, v16.4s, v6.4s fmax v17.4s, v17.4s, v6.4s fmax v18.4s, v18.4s, v6.4s @@ -523,8 +561,12 @@ blt E1 bne E4LoopLR E4LoopLREnd: - cbz x5, StoreLH4x4 + cbz x26, AddBiasLH4x4 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] + MNN_ADD_FLAOT v16, v17, v18, v19, v0, v1, v2, v3 + AddBiasLH4x4: + cbz x5, StoreLH4x4 ld1 {v0.4s}, [x20] fmla v16.4s, v0.4s, v5.s[1] @@ -620,8 +662,15 @@ LoopE1: sub x8, x8, #2 cmp x8, #2 - cbz x5, StoreLH1x8 + cbz x26, AddBiasLH1x8 + ld1 {v0.4s}, [x0], x7 + ld1 {v1.4s}, [x0] + fadd v16.4s, v16.4s, v0.4s + fadd v20.4s, v20.4s, v1.4s + sub x0, x0, x7 + AddBiasLH1x8: + cbz x5, StoreLH1x8 ld1 {v0.4s, v1.4s}, [x20], #32 fmla v16.4s, v0.4s, v5.s[1] @@ -676,8 +725,12 @@ LoopE1: bne E1LoopLR E1LoopLREnd: - cbz x5, StoreLH1x4 + cbz x26, AddBiasLH1x4 + ld1 {v0.4s}, [x0] + fadd v16.4s, v16.4s, v0.4s + AddBiasLH1x4: + cbz x5, StoreLH1x4 ld1 {v0.4s}, [x20] fmla v16.4s, v0.4s, v5.s[1] diff --git a/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMul_int4.S b/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMul_int4.S index 4528e96dd..fe2691168 100644 --- a/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMul_int4.S +++ 
b/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMul_int4.S @@ -11,22 +11,36 @@ .text .align 5 + +.macro MNN_ADD_FLAOT s0, s1, s2, s3, z0, z1, z2, z3 + fadd \s0\().4s, \s0\().4s, \z0\().4s + fadd \s1\().4s, \s1\().4s, \z1\().4s + fadd \s2\().4s, \s2\().4s, \z2\().4s + fadd \s3\().4s, \s3\().4s, \z3\().4s + +.endm + // 12 * 8 MatMul asm_function MNNPackedMatMul_int4 //void MNNPackedMatMul(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias); // x0: C, x1:A, x2:B, x3:parameter, x4: postParameters, x5:bias, x6: k, x7: b -stp d14, d15, [sp, #-80]! +stp d14, d15, [sp, #-112]! stp d12, d13, [sp, #16] stp d10, d11, [sp, #32] stp d8, d9, [sp, #48] stp x19, x20, [sp, #64] +stp x21, x22, [sp, #80] +stp x23, x24, [sp, #96] //ldr x8, [x3, #0] // deprecated ldr x9, [x3, #8] // l ldr x10, [x3, #16] // h ldr x13, [x3, #24] // cStride -ldr x11, [x3, #40] // bExtraStride +ldr x11, [x3, #40] // bExtraStride = (LSize - l) * (hP * sizeof(int4_t)) +ldr x21, [x3, #48] // blockId + +//add x11, x11, x9, LSL #2 // bStride = (hP * sizeof(int4_t)) * l + bExtraStride // v0, v1, v2: A // v3, v4: B @@ -45,7 +59,7 @@ cmp x10, #2 blt LH4 LH8: -// sub x14, x13, #160 +sub x14, x13, #128 LoopH: mov x15, x1 @@ -231,14 +245,33 @@ LoopH: fmla v30.4s, v7.4s, v2.s[2] fmla v31.4s, v7.4s, v2.s[3] - cbz x4, StoreLH8 + cbz x21, AddBiasLH8 + // add dst value + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v8, v9, v10, v11, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v12, v13, v14, v15, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], x14 + MNN_ADD_FLAOT v16, v17, v18, v19, v0, v1, v2, v3 + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v20, v21, v22, v23, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v24, v25, v26, v27, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] + MNN_ADD_FLAOT v28, v29, v30, v31, v0, v1, v2, v3 + sub x0, x0, #256 + sub x0, x0, x14 + AddBiasLH8: + cbz x4, StoreLH8 ld1 {v5.4s}, [x4] dup v6.4s, v5.s[2] // Min Value dup v7.4s, v5.s[3] // Max Value - ld1 {v0.4s, v1.4s}, [x5], #32 + cbz x5, PostTreatLH8 + ld1 {v0.4s, v1.4s}, [x5], #32 fmla v8.4s, v0.4s, v5.s[1] fmla v9.4s, v0.4s, v5.s[1] fmla v10.4s, v0.4s, v5.s[1] @@ -452,11 +485,22 @@ LoopHRemain: fmla v18.4s, v21.4s, v6.s[2] fmla v19.4s, v21.4s, v6.s[3] - cbz x4, StoreLH4 + cbz x21, AddBiasLH4 + // add dst value + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v8, v9, v10, v11, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v12, v13, v14, v15, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] + MNN_ADD_FLAOT v16, v17, v18, v19, v0, v1, v2, v3 + sub x0, x0, #128 + AddBiasLH4: + cbz x4, StoreLH4 ld1 {v5.4s}, [x4] dup v6.4s, v5.s[2] // Min Value dup v7.4s, v5.s[3] // Max Value + cbz x5, PostTreatLH4 ld1 {v0.4s}, [x5], #16 fmla v8.4s, v0.4s, v5.s[1] @@ -511,11 +555,13 @@ LoopHRemain: End: +ldp x23, x24, [sp, #96] +ldp x21, x22, [sp, #80] ldp x19, x20, [sp, #64] ldp d8, d9, [sp, #48] ldp d10, d11, [sp, #32] ldp d12, d13, [sp, #16] -ldp d14, d15, [sp], #80 +ldp d14, d15, [sp], #112 ret diff --git a/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMul_int8.S b/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMul_int8.S index 346cba3c5..e4716e99f 100644 --- a/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMul_int8.S +++ b/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMul_int8.S @@ -11,23 +11,33 @@ .text .align 5 + +.macro 
MNN_ADD_FLAOT s0, s1, s2, s3, z0, z1, z2, z3 + fadd \s0\().4s, \s0\().4s, \z0\().4s + fadd \s1\().4s, \s1\().4s, \z1\().4s + fadd \s2\().4s, \s2\().4s, \z2\().4s + fadd \s3\().4s, \s3\().4s, \z3\().4s + +.endm // 12 * 8 MatMul asm_function MNNPackedMatMul_int8 //void MNNPackedMatMul(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias); // x0: C, x1:A, x2:B, x3:parameter, x4: postParameters, x5:bias, x6: k, x7: b -stp d14, d15, [sp, #-80]! +stp d14, d15, [sp, #-96]! stp d12, d13, [sp, #16] stp d10, d11, [sp, #32] stp d8, d9, [sp, #48] stp x19, x20, [sp, #64] +stp x21, x22, [sp, #80] //ldr x8, [x3, #0] // deprecated ldr x9, [x3, #8] // l ldr x10, [x3, #16] // h ldr x13, [x3, #24] // cStride -ldr x11, [x3, #40] // bExtraStride - +ldr x11, [x3, #40] // bExtraStride = (LSize - l) * hP +ldr x21, [x3, #48] // blockId +//add x11, x11, x9, LSL #3 // bStride = hP * l + bExtraStride // v0, v1, v2: A // v3, v4: B // v8 - v31: C @@ -45,7 +55,7 @@ cmp x10, #2 blt LH4 LH8: -// sub x14, x13, #160 +sub x14, x13, #128 LoopH: mov x15, x1 @@ -237,9 +247,26 @@ LoopH: sub x10, x10, #2 cmp x10, #2 - cbz x4, StoreLH8 + cbz x21, AddBiasLH8 + // add dst value + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v8, v9, v10, v11, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v12, v13, v14, v15, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], x14 + MNN_ADD_FLAOT v16, v17, v18, v19, v0, v1, v2, v3 + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v20, v21, v22, v23, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v24, v25, v26, v27, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] + MNN_ADD_FLAOT v28, v29, v30, v31, v0, v1, v2, v3 + sub x0, x0, #256 + sub x0, x0, x14 AddBiasLH8: + cbz x4, StoreLH8 ld1 {v5.4s}, [x4] dup v6.4s, v5.s[2] // Min Value dup v7.4s, v5.s[3] // Max Value @@ -460,8 +487,19 @@ LoopHRemain: LoopLREnd: - cbz x4, StoreLH4 + cbz x21, AddBiasLH4 + // add dst value + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v8, v9, v10, v11, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v12, v13, v14, v15, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] + MNN_ADD_FLAOT v16, v17, v18, v19, v0, v1, v2, v3 + + sub x0, x0, #128 + AddBiasLH4: + cbz x4, StoreLH4 ld1 {v5.4s}, [x4] dup v6.4s, v5.s[2] // Min Value dup v7.4s, v5.s[3] // Max Value @@ -524,11 +562,12 @@ LoopHRemain: End: +ldp x21, x22, [sp, #80] ldp x19, x20, [sp, #64] ldp d8, d9, [sp, #48] ldp d10, d11, [sp, #32] ldp d12, d13, [sp, #16] -ldp d14, d15, [sp], #80 +ldp d14, d15, [sp], #96 ret diff --git a/source/backend/cpu/compute/CommonOptFunction.cpp b/source/backend/cpu/compute/CommonOptFunction.cpp index 72983f925..f9ce9567c 100644 --- a/source/backend/cpu/compute/CommonOptFunction.cpp +++ b/source/backend/cpu/compute/CommonOptFunction.cpp @@ -588,21 +588,15 @@ static void _MNNPackedMatMulRemain_int4(float* C, const float* A, const float* f auto hRemain = parameter[4]; float weightBytes = 0.5; // sizeof(int4_t) auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; auto hC4 = UP_DIV(h, 4); - for (int y=0; y().max(); float maxValue = std::numeric_limits().max(); if (nullptr != postParameters) { minValue = postParameters[2]; maxValue = postParameters[3]; - alpha = postParameters[0]; - beta = postParameters[1]; } + int blockId = parameter[6]; for (int x=0; x 0) { + 
summer[0] = dstY[0]; + summer[1] = dstY[1]; + summer[2] = dstY[2]; + summer[3] = dstY[3]; + } + if (nullptr != bias && nullptr != postParameters) { for (int v=0; v<4; ++v) { - summer[v] = bias[4 * y + v]; + summer[v] += bias[4 * y + v]; } } for (int z=0; z(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; auto hC4 = UP_DIV(h, 4); - for (int y=0; y().max(); float maxValue = std::numeric_limits().max(); if (nullptr != postParameters) { minValue = postParameters[2]; maxValue = postParameters[3]; - alpha = postParameters[0]; - beta = postParameters[1]; } + int blockId = parameter[6]; for (int x=0; x 0) { + summer[0] = dstY[0]; + summer[1] = dstY[1]; + summer[2] = dstY[2]; + summer[3] = dstY[3]; + } + if (nullptr != bias && nullptr != postParameters) { for (int v=0; v<4; ++v) { - summer[v] = bias[4 * y + v]; + summer[v] += bias[4 * y + v]; } } for (int z=0; zcanUseInt4) { for (int i = 0; i < h; ++i) { int8Info->alpha.get()[i] *= -8.0; - core->MNNFp32ToLowp(int8Info->alpha.get(), reinterpret_cast(biasPtr), h); } + core->MNNFp32ToLowp(int8Info->alpha.get(), reinterpret_cast(biasPtr), h); } } } else { @@ -86,6 +86,19 @@ bool ConvolutionHybrid::initQuantizeResource(std::shared_ptr data(weightLength, 0); + auto srcWInt8 = int8Info->weight.get(); + if (hP * hU != outputCount || lP * lU != srcChannel) { + int packedic = lU * lP; + for (int i = 0; i < outputCount; ++i) { + for (int j = 0; j < srcChannel; ++j) { + int destIdx = i * packedic + j; + int srcIdx = i * srcChannel + j; + data[destIdx] = srcWInt8[srcIdx]; + } + } + srcWInt8 = data.data(); + } if (int8Info->canUseInt4) { MNN_ASSERT(weightLength % 2 == 0); weightLength = UP_DIV(weightLength, 2); @@ -101,10 +114,10 @@ bool ConvolutionHybrid::initQuantizeResource(std::shared_ptrmWeight->host(); - auto srcWInt8 = int8Info->weight.get(); // oc, ic -> oc/hP, ic/lP, hP, lP for (int i = 0; i < hU; i++) { for (int j = 0; j < lU; j++) { for (int k = 0; k < hP; k++) { for (int l = 0; l < lP; l++) { - dstWInt8[i * srcChannel * hP + j * hP * lP + k * lP + l] = srcWInt8[(i * hP + k) * srcChannel + (j * lP + l)]; + dstWInt8[i * lU * lP * hP + j * hP * lP + k * lP + l] = srcWInt8[(i * hP + k) * lP * lU + (j * lP + l)]; } } } diff --git a/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp b/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp index 10df5d863..f844d7d5b 100644 --- a/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp +++ b/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp @@ -17,7 +17,7 @@ #include "math/Vec.hpp" #include "core/BufferAllocator.hpp" #include "core/MemoryFormater.h" -#define PARAMETERSIZE 6 +#define PARAMETERSIZE 7 using Vec4 = MNN::Math::Vec; namespace MNN { @@ -60,7 +60,15 @@ bool DenseConvolutionTiledExecutor::initQuantizeResource(std::shared_ptrmDequantize.mScaleBias.reset(MNN::Tensor::createDevice({hU * hP * 2})); + int dequantCnt = int8Info->alpha.size(); + int scaleSize = dequantCnt; // real size + if (int8Info->asymmetric) { + scaleSize = dequantCnt / 2; + + } + int blockNum = scaleSize / outputCount; + scaleSize = blockNum * hU * hP; // pack size + resource->mDequantize.mScaleBias.reset(MNN::Tensor::createDevice({scaleSize * 2 * bytes})); res = resource->backend->onAcquireBuffer(resource->mDequantize.mScaleBias.get(), Backend::STATIC); if (!res) { return false; @@ -88,36 +96,56 @@ bool DenseConvolutionTiledExecutor::initQuantizeResource(std::shared_ptrmWeight = weightLow; } auto alphaPtr = 
resource->mDequantize.mScaleBias->host(); - auto biasPtr = reinterpret_cast(reinterpret_cast(alphaPtr) + hU * hP * bytes); - ::memset(alphaPtr, 0, 2 * hU * hP * bytes); + auto biasPtr = reinterpret_cast(reinterpret_cast(alphaPtr) + scaleSize * bytes); + ::memset(alphaPtr, 0, 2 * scaleSize * bytes); int h = int8Info->alpha.size(); if (bytes == 2) { auto core = static_cast(resource->backend)->functions(); - std::vector tmpAlpha(hU*hP*2, 0.0f); + std::vector tmpAlpha(scaleSize * 2, 0.0f); if (int8Info->asymmetric) { - int hh = h / 2; - for (int i=0; ialpha.get()[2 * i + 1]; - tmpAlpha[i+hU*hP] = int8Info->alpha.get()[2 * i] + (float)originOffset * int8Info->alpha.get()[2 * i + 1]; + for (int i = 0; i < blockNum; ++i) { + auto dstAlpha = tmpAlpha.data() + i * hU * hP; + auto srcAlpha = int8Info->alpha.get(); + for (int j = 0; j < outputCount; ++j) { + int scaleIndex = j * blockNum + i; + dstAlpha[j] = srcAlpha[2 * scaleIndex + 1]; + dstAlpha[j + scaleSize] = srcAlpha[2 * scaleIndex] + (float)originOffset * dstAlpha[j]; + } } } else { - for (int i=0; ialpha.get()[i]; - tmpAlpha[i+hU*hP] = (float)originOffset * int8Info->alpha.get()[i]; + for (int i = 0; i < blockNum; ++i) { + auto dstAlpha = tmpAlpha.data() + i * hU * hP; + auto srcAlpha = int8Info->alpha.get(); + for (int j = 0; j < outputCount; ++j) { + int scaleIndex = j * blockNum + i; + dstAlpha[j] = srcAlpha[scaleIndex]; + dstAlpha[j + scaleSize] = (float)originOffset * dstAlpha[j]; + } } } - core->MNNFp32ToLowp(tmpAlpha.data(), reinterpret_cast(alphaPtr), hU*hP*2); + core->MNNFp32ToLowp(tmpAlpha.data(), reinterpret_cast(alphaPtr), scaleSize * 2); } else { if (int8Info->asymmetric) { - int hh = h / 2; - for (int i=0; ialpha.get()[2 * i + 1]; - biasPtr[i] = int8Info->alpha.get()[2 * i] + (float)originOffset * alphaPtr[i]; + for (int i = 0; i < blockNum; ++i) { + auto dstAlpha = alphaPtr + i * hU * hP; + auto dstBias = biasPtr + i * hU * hP; + auto srcAlpha = int8Info->alpha.get(); + for (int j = 0; j < outputCount; ++j) { + int scaleIndex = j * blockNum + i; + dstAlpha[j] = srcAlpha[2 * scaleIndex + 1]; + dstBias[j] = srcAlpha[2 * scaleIndex] + (float)originOffset * dstAlpha[j]; + } } } else { - for (int i=0; ialpha.get()[i]; - biasPtr[i] = 0.f + (float)originOffset * alphaPtr[i]; + for (int i = 0; i < blockNum; ++i) { + auto dstAlpha = alphaPtr + i * hU * hP; + auto dstBias = biasPtr + i * hU * hP; + auto srcAlpha = int8Info->alpha.get(); + for (int j = 0; j < outputCount; ++j) { + int scaleIndex = j * blockNum + i; + dstAlpha[j] = srcAlpha[scaleIndex]; + dstBias[j] = (float)originOffset * dstAlpha[j]; + } } } } @@ -435,11 +463,27 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs auto weightType = weight->getType(); const uint8_t* dequantAlpha = nullptr; const uint8_t* dequantBias = nullptr; + auto ic = input->channel(); + auto icC4 = UP_DIV(ic, unit); + auto L = ic * mCommon->kernelY() * mCommon->kernelX(); + auto tileC = std::max(unit, hP); + int blockSize = L; + int blockNum = 1; + float halfStride = 1; + size_t weightStride = 0; #ifdef MNN_LOW_MEMORY if (mResource && mResource->mDequantize.bits <= 8) { DenseConvolutionTiledExecutor::selectLowMemoryMatmulFunc(&matmulUnit, &matmulRemain, &weightBytes, mResource->mDequantize.bits, core); + int scaleSize = mResource->mDequantize.mScaleBias->size() / (2 * bytes); + blockNum = scaleSize / (mResource->hU * mResource->hP); + blockSize /= blockNum; dequantAlpha = mResource->mDequantize.mScaleBias->host(); - dequantBias = dequantAlpha + mResource->hU * mResource->hP * 
bytes; + dequantBias = dequantAlpha + scaleSize * bytes; + weightStride = (L - blockSize) * hP; + if (mResource->mDequantize.bits == 4) { + halfStride = 0.5; + weightStride = static_cast(weightStride * halfStride); + } } #endif auto kernel_width = mCommon->kernelX(); @@ -447,14 +491,12 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs auto output = outputs[0]; auto batch = output->batch(); int threadNumber = ((CPUBackend *)backend())->threadNumber(); - auto icC4 = UP_DIV(input->channel(), unit); - auto ic = input->channel(); - auto L = ic * mCommon->kernelY() * mCommon->kernelX(); + int LRoundup = ROUND_UP(L, lP); int LRoundupC4 = UP_DIV(LRoundup, unit); auto outputChannel = output->channel(); - auto tileC = std::max(unit, hP); auto oC4 = UP_DIV(outputChannel, tileC); + auto ocUp4 = ROUND_UP(outputChannel, hP); auto kernelSize = mCommon->kernelX() * mCommon->kernelY(); ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParameters, mCommon, input, output, mPadX, mPadY, core, nullptr); @@ -507,11 +549,12 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs size_t shapeParameters[PARAMETERSIZE]; size_t* parameters = shapeParameters; parameters[0] = eP * bytes; - parameters[1] = L; + parameters[1] = blockSize; parameters[2] = outputChannel; parameters[3] = plane * unit * bytes; parameters[4] = 0; - parameters[5] = 0; + parameters[5] = weightStride; // Only used when block quant + parameters[6] = 0; #ifdef PROFILE_DETAIL std::vector durationMul(threadNumberFirst, 0); @@ -572,9 +615,24 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs auto _weightFloatPtr = reinterpret_cast(weightPtr + int((ocIndex / hP * LRoundup * hP) * weightBytes)); auto _biasFloatPtr = reinterpret_cast(reinterpret_cast(biasPtr) + ocIndex * bytes); paraParameters[2] = std::min(outputChannel - ocIndex, tileC); - auto k = reinterpret_cast(dequantAlpha + ocIndex * bytes); - auto b = reinterpret_cast(dequantBias + ocIndex * bytes); - matmulUnit(_dstFloatPtr, (float*)gemmBuffer, _weightFloatPtr, paraParameters, postParameters.data(), _biasFloatPtr, k, b); + auto k = reinterpret_cast(dequantAlpha + ocIndex * bytes); + auto b = reinterpret_cast(dequantBias + ocIndex * bytes); + const float* relufp32 = nullptr; + const float* exeBiasPtr = nullptr; + int finishedL = 0; + int wquantStride = 0; + auto _weightPtr = reinterpret_cast(_weightFloatPtr); + uint8_t* _APtr = reinterpret_cast(gemmBuffer); + for (int bk = 0; bk < blockNum; ++bk) { + paraParameters[6] = bk; + if (bk == blockNum - 1) { + relufp32 = postParameters.data(); + exeBiasPtr = _biasFloatPtr; + } + finishedL = blockSize * bk; + wquantStride = static_cast(blockSize * bk * hP * halfStride); + matmulUnit(_dstFloatPtr, (float*)(_APtr + eP * finishedL * bytes), (float*)(_weightPtr + wquantStride), paraParameters, relufp32, exeBiasPtr, (float*)(k + bk * ocUp4 * bytes), (float*)(b + bk * ocUp4 * bytes)); + } } } MNN_CONCURRENCY_END(); @@ -588,9 +646,24 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs auto _weightFloatPtr = reinterpret_cast(weightPtr + int((ocIndex / hP * LRoundup * hP) * weightBytes)); auto _biasFloatPtr = reinterpret_cast(reinterpret_cast(biasPtr) + ocIndex * bytes); paraParameters[2] = std::min(outputChannel - ocIndex, tileC); - auto k = reinterpret_cast(dequantAlpha + ocIndex * bytes); - auto b = reinterpret_cast(dequantBias + ocIndex * bytes); - matmulRemain(_dstFloatPtr, (float*)gemmBuffer, _weightFloatPtr, xC, paraParameters, postParameters.data(), 
_biasFloatPtr, k, b); + auto k = reinterpret_cast(dequantAlpha + ocIndex * bytes); + auto b = reinterpret_cast(dequantBias + ocIndex * bytes); + const float* relufp32 = nullptr; + const float* exeBiasPtr = nullptr; + int finishedL = 0; + int wquantStride = 0; + const int8_t* _weightPtr = reinterpret_cast(_weightFloatPtr); + uint8_t* _APtr = reinterpret_cast(gemmBuffer); + for (int bk = 0; bk < blockNum; ++bk) { + paraParameters[6] = bk; + if (bk == blockNum - 1) { + relufp32 = postParameters.data(); + exeBiasPtr = _biasFloatPtr; + } + finishedL = blockSize * bk; + wquantStride = static_cast(blockSize * bk * hP * halfStride); + matmulRemain(_dstFloatPtr, (float*)(_APtr + eP * finishedL * bytes), (float*)(_weightPtr + wquantStride), xC, paraParameters, relufp32, exeBiasPtr, (float*)(k + bk * ocUp4 * bytes), (float*)(b + bk * ocUp4 * bytes)); + } } } MNN_CONCURRENCY_END(); @@ -633,11 +706,12 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs info[3] = mIm2ColParameters.strideX; size_t parameters[PARAMETERSIZE]; parameters[0] = eP * bytes; - parameters[1] = L; + parameters[1] = blockSize; parameters[2] = outputChannel; parameters[3] = plane * unit * bytes; parameters[4] = 0; - parameters[5] = 0; + parameters[5] = weightStride; // Only used when block quant + parameters[6] = 0; #ifdef PROFILE_DETAIL std::vector durationMul(threadNumberFirst, 0); @@ -673,13 +747,38 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs packATime[tId] += timer[tId].durationInUs(); timer[tId].reset(); #endif - auto k = reinterpret_cast(dequantAlpha); - auto b = reinterpret_cast(dequantBias); + int finishedL = 0; + int wquantStride = 0; + int8_t* _weightPtr = reinterpret_cast(weightPtr); auto _dstFloatPtr = reinterpret_cast(dstOrigin + start * unit * bytes); + const float* relufp32 = nullptr; + const float* exeBiasPtr = nullptr; if (xC == eP) { - matmulUnit(_dstFloatPtr, (float*)gemmBuffer, (float*)weightPtr, parameters, postParameters.data(), biasPtr, k, b); + // matmulUnit(_dstFloatPtr, (float*)gemmBuffer, (float*)weightPtr, parameters, postParameters.data(), biasPtr, k, b); + for (int bk = 0; bk < blockNum; ++bk) { + parameters[6] = bk; + if (bk == blockNum - 1) { + relufp32 = postParameters.data(); + exeBiasPtr = biasPtr; + } + finishedL = blockSize * bk; + wquantStride = static_cast(blockSize * bk * hP * halfStride); + + matmulUnit(_dstFloatPtr, (float*)(gemmBuffer + bytes * eP * finishedL), (float*)(_weightPtr + wquantStride), parameters, relufp32, exeBiasPtr, (float*)(dequantAlpha + bk * ocUp4 * bytes), (float*)(dequantBias + bk * ocUp4 * bytes)); + } } else { - matmulRemain(_dstFloatPtr, (float*)gemmBuffer, (float*)weightPtr, xC, parameters, postParameters.data(), biasPtr, k, b); + for (int bk = 0; bk < blockNum; ++bk) { + parameters[6] = bk; + if (bk == blockNum - 1) { + relufp32 = postParameters.data(); + exeBiasPtr = biasPtr; + } + finishedL = blockSize * bk; + wquantStride = static_cast(blockSize * bk * hP * halfStride); + + matmulRemain(_dstFloatPtr, (float*)(gemmBuffer + eP * bytes * finishedL), (float*)(_weightPtr + wquantStride), xC, parameters, relufp32, exeBiasPtr, (float*)(dequantAlpha + bk * ocUp4 * bytes), (float*)(dequantBias + bk * ocUp4 * bytes )); + } + // matmulRemain(_dstFloatPtr, (float*)gemmBuffer, (float*)weightPtr, xC, parameters, postParameters.data(), biasPtr, k, b); } #ifdef PROFILE_DETAIL diff --git a/source/backend/cpu/compute/StrassenMatmulComputor.cpp b/source/backend/cpu/compute/StrassenMatmulComputor.cpp index 04407b60c..094efbd5a 
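The onResize changes above split the reduction dimension L into blockNum blocks of blockSize and call the packed matmul once per block: parameters[1] carries blockSize, parameters[5] the extra weight stride between blocks, and the new parameters[6] the block index, while bias and post-parameters are passed only on the last block so activation is applied exactly once (which is also why the AVX/SSE wrappers below skip the post-treatment when bias is nullptr). A minimal C++ sketch of that dispatch loop under assumed names; runBlockQuantGemm and PackedMatMulFunc are invented for illustration and are not MNN's actual entry points:

#include <cstddef>
#include <cstdint>

// Signature assumed to mirror the packed-matmul kernels touched in this patch.
typedef void (*PackedMatMulFunc)(float* C, const float* A, const uint8_t* B,
                                 size_t* parameters, const float* postParameters,
                                 const float* bias, const float* scale, const float* offset);

// Drive one output tile through all quant blocks.
// parameters[1] = blockSize, parameters[5] = extra weight stride, parameters[6] = block index.
void runBlockQuantGemm(float* C, const float* A, const uint8_t* B, size_t* parameters,
                       const float* postParameters, const float* biasData,
                       const float* scales, const float* offsets,
                       int blockNum, int blockSize, int eP, int hP, int ocUp4,
                       float weightBytes /* 1.0 for int8, 0.5 for int4 */,
                       PackedMatMulFunc matmul) {
    const float* relu = nullptr;     // post treatment only on the last block
    const float* bias = nullptr;
    for (int bk = 0; bk < blockNum; ++bk) {
        parameters[6] = (size_t)bk;  // 0: overwrite C, > 0: accumulate into C
        if (bk == blockNum - 1) {    // bias + activation are applied exactly once
            relu = postParameters;
            bias = biasData;
        }
        const int finishedL   = blockSize * bk;                      // depth already consumed
        const int weightShift = (int)(finishedL * hP * weightBytes); // bytes into packed weights
        matmul(C,
               A + eP * finishedL,      // advance the packed A tile by whole blocks
               B + weightShift,
               parameters, relu, bias,
               scales + bk * ocUp4,     // per-block dequant scale plane
               offsets + bk * ocUp4);   // per-block dequant bias plane
    }
}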
100644 --- a/source/backend/cpu/compute/StrassenMatmulComputor.cpp +++ b/source/backend/cpu/compute/StrassenMatmulComputor.cpp @@ -94,13 +94,14 @@ ErrorCode StrassenMatrixComputor::_generateTrivalMatMul(int e, int l, int h, con mFunctions.emplace_back( std::make_pair([cStride, l, h, xCount, AT, BT, CT, COT, tileBufferBasic, unitNumber, bExtraStride, numberThread, eReal, eP, active, matmulUnit, matmulRemain, dequantAlpha, dequantBias, this](int tId) { auto core = static_cast(backend())->functions(); - size_t parameters[6]; + size_t parameters[7]; parameters[0] = xCount * core->bytes; parameters[1] = l; parameters[2] = h; parameters[3] = cStride; parameters[4] = 0; parameters[5] = bExtraStride; + parameters[6] = 0; auto tileHost = tileBufferBasic.ptr() + eP * parameters[1] * tId * core->bytes; const float* postParametersPtr = nullptr; if (!active.empty()) { diff --git a/source/backend/cpu/x86_x64/avx/GemmAVX2.cpp b/source/backend/cpu/x86_x64/avx/GemmAVX2.cpp index 56eaf70fb..2214e1688 100644 --- a/source/backend/cpu/x86_x64/avx/GemmAVX2.cpp +++ b/source/backend/cpu/x86_x64/avx/GemmAVX2.cpp @@ -35,22 +35,30 @@ void _AVX_MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t void _AVX_MNNPackedMatMul_int4(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b) { _AVX_MNNPackedMatMul_Main_int4(C, A, B, parameter, k, b); - AVX2GemmPostTreat(C, MNN_UNIT_E, parameter, postParameters, bias); + if (nullptr != bias) { + AVX2GemmPostTreat(C, MNN_UNIT_E, parameter, postParameters, bias); + } } void _AVX_MNNPackedMatMulRemain_int4(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b) { _AVX_MNNPackednMatMulRemainCommon_int4(C, A, B, eSize, parameter, k, b); - AVX2GemmPostTreat(C, eSize, parameter, postParameters, bias); + if (nullptr != bias) { + AVX2GemmPostTreat(C, eSize, parameter, postParameters, bias); + } } void _AVX_MNNPackedMatMul_int8(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b) { _AVX_MNNPackedMatMul_Main_int8(C, A, B, parameter, k, b); - AVX2GemmPostTreat(C, MNN_UNIT_E, parameter, postParameters, bias); + if (nullptr != bias) { + AVX2GemmPostTreat(C, MNN_UNIT_E, parameter, postParameters, bias); + } } void _AVX_MNNPackedMatMulRemain_int8(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b) { _AVX_MNNPackednMatMulRemainCommon_int8(C, A, B, eSize, parameter, k, b); - AVX2GemmPostTreat(C, eSize, parameter, postParameters, bias); + if (nullptr != bias) { + AVX2GemmPostTreat(C, eSize, parameter, postParameters, bias); + } } static __m128i _load_int4_to_int8(const uint8_t* src) { uint8_t c = 0xf; diff --git a/source/backend/cpu/x86_x64/avx/GemmFunction.hpp b/source/backend/cpu/x86_x64/avx/GemmFunction.hpp index 08dfdfef9..bf299722c 100644 --- a/source/backend/cpu/x86_x64/avx/GemmFunction.hpp +++ b/source/backend/cpu/x86_x64/avx/GemmFunction.hpp @@ -20,6 +20,27 @@ STORE_4(dst + 8 * (3 + 4 * u + 8 * v), m3); \ } +#define FMLA_TRANSPOSE_SAVE(u, v, z0, z3, z6, z9) \ + { \ + auto tmp_m0 = LOAD4(dst + 8 * (0 + 4 * u + 8 * v)); \ + auto tmp_m1 = LOAD4(dst + 8 * (1 + 4 * u + 8 * v)); \ + auto tmp_m2 = LOAD4(dst + 8 * (2 + 4 * u + 8 * v)); \ + auto tmp_m3 = LOAD4(dst + 8 * (3 + 4 * u + 8 * 
v)); \ + auto m0 = _mm256_extractf128_ps(z0, u); \ + auto m1 = _mm256_extractf128_ps(z3, u); \ + auto m2 = _mm256_extractf128_ps(z6, u); \ + auto m3 = _mm256_extractf128_ps(z9, u); \ + _MM_TRANSPOSE4_PS(m0, m1, m2, m3); \ + m0 = _mm_add_ps(tmp_m0, m0); \ + m1 = _mm_add_ps(tmp_m1, m1); \ + m2 = _mm_add_ps(tmp_m2, m2); \ + m3 = _mm_add_ps(tmp_m3, m3); \ + STORE_4(dst + 8 * (0 + 4 * u + 8 * v), m0); \ + STORE_4(dst + 8 * (1 + 4 * u + 8 * v), m1); \ + STORE_4(dst + 8 * (2 + 4 * u + 8 * v), m2); \ + STORE_4(dst + 8 * (3 + 4 * u + 8 * v), m3); \ + } + namespace { static inline __m128i mm_loadu_si128(const void* addr) { return _mm_castps_si128(LOAD4((const float*)addr)); @@ -858,9 +879,10 @@ static void _AVX_MNNPackedMatMul_Main_int4(TYPE* C, const TYPE* A, const TYPE* f auto cStride = parameter[3] / sizeof(TYPE); float weightBytes = 0.5; // sizeof(int4_t) auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; auto hC4 = UP_DIV(h, 4); float ws_tmp[4]; + size_t blockId = parameter[6]; for (int y = 0; y < hC4; ++y) { auto weight = B + y * bStride / 2; auto dst = C + (y / 2) * cStride + 4 * (y % 2); @@ -911,12 +933,21 @@ static void _AVX_MNNPackedMatMul_Main_int4(TYPE* C, const TYPE* A, const TYPE* f z10 = MNNAVXFMA(s1, w3, z10); z11 = MNNAVXFMA(s2, w3, z11); } - TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(1, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); - TRANPOSE_SAVE(1, 1, z1, z4, z7, z10); - TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); - TRANPOSE_SAVE(1, 2, z2, z5, z8, z11); + if (blockId == 0) { + TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(1, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + TRANPOSE_SAVE(1, 1, z1, z4, z7, z10); + TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); + TRANPOSE_SAVE(1, 2, z2, z5, z8, z11); + } else { + FMLA_TRANSPOSE_SAVE(0, 0, z0, z3, z6, z9); + FMLA_TRANSPOSE_SAVE(1, 0, z0, z3, z6, z9); + FMLA_TRANSPOSE_SAVE(0, 1, z1, z4, z7, z10); + FMLA_TRANSPOSE_SAVE(1, 1, z1, z4, z7, z10); + FMLA_TRANSPOSE_SAVE(0, 2, z2, z5, z8, z11); + FMLA_TRANSPOSE_SAVE(1, 2, z2, z5, z8, z11); + } } } @@ -929,7 +960,8 @@ static void _AVX_MNNPackedMatMul_int4_20(TYPE* C, const TYPE* A, const uint8_t* auto cStride = parameter[3] / sizeof(TYPE); float weightBytes = 0.5; // sizeof(int4_t) auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); float ws_tmp[4]; for (int y = 0; y < hC4; ++y) { @@ -981,11 +1013,19 @@ static void _AVX_MNNPackedMatMul_int4_20(TYPE* C, const TYPE* A, const uint8_t* z10 = MNNAVXFMA(s1, w3, z10); z11 = MNNAVXFMA(s2, w3, z11); } - TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(1, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); - TRANPOSE_SAVE(1, 1, z1, z4, z7, z10); - TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); + if (0 == blockId) { + TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(1, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + TRANPOSE_SAVE(1, 1, z1, z4, z7, z10); + TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); + } else { + FMLA_TRANSPOSE_SAVE(0, 0, z0, z3, z6, z9); + FMLA_TRANSPOSE_SAVE(1, 0, z0, z3, z6, z9); + FMLA_TRANSPOSE_SAVE(0, 1, z1, z4, z7, z10); + FMLA_TRANSPOSE_SAVE(1, 1, z1, z4, z7, z10); + FMLA_TRANSPOSE_SAVE(0, 2, z2, z5, z8, z11); + } } } @@ -997,7 +1037,8 @@ static void _AVX_MNNPackedMatMul_int4_16(TYPE* C, const TYPE* A, const uint8_t* auto cStride = parameter[3] / sizeof(TYPE); 
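Every store path in these kernels follows the same rule: when blockId is 0 the tile overwrites C (the original TRANPOSE_SAVE and plain stores), and for any later block the kernel loads the partial sums already in C, adds, and stores back (the new FMLA_* macros and the added else branches). A minimal SSE rendering of that idea, with an invented helper name and assuming 16 writable floats at dst:

#include <cstddef>
#include <xmmintrin.h>  // SSE: __m128, _mm_loadu_ps, _mm_add_ps, _mm_storeu_ps

// Hypothetical helper: write a 4x4 float tile of partial results into C.
static inline void storeOrAccumulateTile(float* dst, const __m128 acc[4], size_t blockId) {
    for (int i = 0; i < 4; ++i) {
        __m128 v = acc[i];
        if (blockId != 0) {
            v = _mm_add_ps(_mm_loadu_ps(dst + 4 * i), v); // add partial sums from earlier blocks
        }
        _mm_storeu_ps(dst + 4 * i, v);                    // the first block simply overwrites
    }
}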
float weightBytes = 0.5; // sizeof(int4_t) auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); float ws_tmp[4]; for (int y = 0; y < hC4; ++y) { @@ -1039,10 +1080,17 @@ static void _AVX_MNNPackedMatMul_int4_16(TYPE* C, const TYPE* A, const uint8_t* z9 = MNNAVXFMA(s0, w3, z9); z10 = MNNAVXFMA(s1, w3, z10); } - TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(1, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); - TRANPOSE_SAVE(1, 1, z1, z4, z7, z10); + if (0 == blockId) { + TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(1, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + TRANPOSE_SAVE(1, 1, z1, z4, z7, z10); + } else { + FMLA_TRANSPOSE_SAVE(0, 0, z0, z3, z6, z9); + FMLA_TRANSPOSE_SAVE(1, 0, z0, z3, z6, z9); + FMLA_TRANSPOSE_SAVE(0, 1, z1, z4, z7, z10); + FMLA_TRANSPOSE_SAVE(1, 1, z1, z4, z7, z10); + } } } @@ -1054,7 +1102,8 @@ static void _AVX_MNNPackedMatMul_int4_5(TYPE* C, const TYPE* A, const uint8_t* B auto cStride = parameter[3] / sizeof(TYPE); float weightBytes = 0.5; auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); int lC4 = l / 4; int lR = lC4 * 4; @@ -1115,17 +1164,53 @@ static void _AVX_MNNPackedMatMul_int4_5(TYPE* C, const TYPE* A, const uint8_t* B weight2 += 2; weight3 += 2; } - STORE_8(dst0, sumAvx00); - STORE_8(dst0 + 8, sumAvx10); - STORE_8(dst0 + 16, sumAvx20); - STORE_8(dst0 + 24, sumAvx30); - STORE_8(dst0 + 32, sumAvx40); - - STORE_8(dst2, sumAvx01); - STORE_8(dst2 + 8, sumAvx11); - STORE_8(dst2 + 16, sumAvx21); - STORE_8(dst2 + 24, sumAvx31); - STORE_8(dst2 + 32, sumAvx41); + if (0 == blockId) { + STORE_8(dst0, sumAvx00); + STORE_8(dst0 + 8, sumAvx10); + STORE_8(dst0 + 16, sumAvx20); + STORE_8(dst0 + 24, sumAvx30); + STORE_8(dst0 + 32, sumAvx40); + + STORE_8(dst2, sumAvx01); + STORE_8(dst2 + 8, sumAvx11); + STORE_8(dst2 + 16, sumAvx21); + STORE_8(dst2 + 24, sumAvx31); + STORE_8(dst2 + 32, sumAvx41); + } else { + auto tmp0 = LOAD8(dst0); + auto tmp1 = LOAD8(dst0 + 8); + auto tmp2 = LOAD8(dst0 + 16); + auto tmp3 = LOAD8(dst0 + 24); + auto tmp4 = LOAD8(dst0 + 32); + auto tmp5 = LOAD8(dst2); + auto tmp6 = LOAD8(dst2 + 8); + auto tmp7 = LOAD8(dst2 + 16); + auto tmp8 = LOAD8(dst2 + 24); + auto tmp9 = LOAD8(dst2 + 32); + + sumAvx00 = _mm256_add_ps(sumAvx00, tmp0); + sumAvx10 = _mm256_add_ps(sumAvx10, tmp1); + sumAvx20 = _mm256_add_ps(sumAvx20, tmp2); + sumAvx30 = _mm256_add_ps(sumAvx30, tmp3); + sumAvx40 = _mm256_add_ps(sumAvx40, tmp4); + sumAvx01 = _mm256_add_ps(sumAvx01, tmp5); + sumAvx11 = _mm256_add_ps(sumAvx11, tmp6); + sumAvx21 = _mm256_add_ps(sumAvx21, tmp7); + sumAvx31 = _mm256_add_ps(sumAvx31, tmp8); + sumAvx41 = _mm256_add_ps(sumAvx41, tmp9); + + STORE_8(dst0, sumAvx00); + STORE_8(dst0 + 8, sumAvx10); + STORE_8(dst0 + 16, sumAvx20); + STORE_8(dst0 + 24, sumAvx30); + STORE_8(dst0 + 32, sumAvx40); + + STORE_8(dst2, sumAvx01); + STORE_8(dst2 + 8, sumAvx11); + STORE_8(dst2 + 16, sumAvx21); + STORE_8(dst2 + 24, sumAvx31); + STORE_8(dst2 + 32, sumAvx41); + } } for (int y = hR; y < hC4; ++y) { auto weight = B + y * bStride / 2; @@ -1157,11 +1242,31 @@ static void _AVX_MNNPackedMatMul_int4_5(TYPE* C, const TYPE* A, const uint8_t* B z3 = MNNSSEFMA(s3, w0, z3); z4 = MNNSSEFMA(s4, w0, z4); } - STORE_4(dst + 8 * 0, z0); - STORE_4(dst + 8 * 1, z1); - STORE_4(dst + 8 * 
2, z2); - STORE_4(dst + 8 * 3, z3); - STORE_4(dst + 8 * 4, z4); + if (0 == blockId) { + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z1); + STORE_4(dst + 8 * 2, z2); + STORE_4(dst + 8 * 3, z3); + STORE_4(dst + 8 * 4, z4); + } else { + auto tmp0 = LOAD4(dst + 8 * 0); + auto tmp1 = LOAD4(dst + 8 * 1); + auto tmp2 = LOAD4(dst + 8 * 2); + auto tmp3 = LOAD4(dst + 8 * 3); + auto tmp4 = LOAD4(dst + 8 * 4); + + z0 = _mm_add_ps(tmp0, z0); + z1 = _mm_add_ps(tmp1, z1); + z2 = _mm_add_ps(tmp2, z2); + z3 = _mm_add_ps(tmp3, z3); + z4 = _mm_add_ps(tmp4, z4); + + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z1); + STORE_4(dst + 8 * 2, z2); + STORE_4(dst + 8 * 3, z3); + STORE_4(dst + 8 * 4, z4); + } } } @@ -1174,7 +1279,8 @@ static void _AVX_MNNPackedMatMul_int4_4(TYPE* C, const TYPE* A, const uint8_t* B auto cStride = parameter[3] / sizeof(TYPE); float weightBytes = 0.5; // sizeof(int4_t) auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); int lC4 = l / 4; int lR = lC4 * 4; @@ -1229,15 +1335,47 @@ static void _AVX_MNNPackedMatMul_int4_4(TYPE* C, const TYPE* A, const uint8_t* B weight2 += 2; weight3 += 2; } - STORE_8(dst0, sumAvx00); - STORE_8(dst0 + 8, sumAvx10); - STORE_8(dst0 + 16, sumAvx20); - STORE_8(dst0 + 24, sumAvx30); - - STORE_8(dst2, sumAvx01); - STORE_8(dst2 + 8, sumAvx11); - STORE_8(dst2 + 16, sumAvx21); - STORE_8(dst2 + 24, sumAvx31); + if (0 == blockId) { + STORE_8(dst0, sumAvx00); + STORE_8(dst0 + 8, sumAvx10); + STORE_8(dst0 + 16, sumAvx20); + STORE_8(dst0 + 24, sumAvx30); + + STORE_8(dst2, sumAvx01); + STORE_8(dst2 + 8, sumAvx11); + STORE_8(dst2 + 16, sumAvx21); + STORE_8(dst2 + 24, sumAvx31); + } else { + auto tmp0 = LOAD8(dst0); + auto tmp1 = LOAD8(dst0 + 8); + auto tmp2 = LOAD8(dst0 + 16); + auto tmp3 = LOAD8(dst0 + 24); + + auto tmp5 = LOAD8(dst2); + auto tmp6 = LOAD8(dst2 + 8); + auto tmp7 = LOAD8(dst2 + 16); + auto tmp8 = LOAD8(dst2 + 24); + + sumAvx00 = _mm256_add_ps(sumAvx00, tmp0); + sumAvx10 = _mm256_add_ps(sumAvx10, tmp1); + sumAvx20 = _mm256_add_ps(sumAvx20, tmp2); + sumAvx30 = _mm256_add_ps(sumAvx30, tmp3); + + sumAvx01 = _mm256_add_ps(sumAvx01, tmp5); + sumAvx11 = _mm256_add_ps(sumAvx11, tmp6); + sumAvx21 = _mm256_add_ps(sumAvx21, tmp7); + sumAvx31 = _mm256_add_ps(sumAvx31, tmp8); + + STORE_8(dst0, sumAvx00); + STORE_8(dst0 + 8, sumAvx10); + STORE_8(dst0 + 16, sumAvx20); + STORE_8(dst0 + 24, sumAvx30); + + STORE_8(dst2, sumAvx01); + STORE_8(dst2 + 8, sumAvx11); + STORE_8(dst2 + 16, sumAvx21); + STORE_8(dst2 + 24, sumAvx31); + } } float ws_tmp[4]; for (int y = hR; y < hC4; ++y) { @@ -1271,10 +1409,28 @@ static void _AVX_MNNPackedMatMul_int4_4(TYPE* C, const TYPE* A, const uint8_t* B z9 = MNNSSEFMA(s0, w3, z9); } _MM_TRANSPOSE4_PS(z0, z3, z6, z9); - STORE_4(dst + 8 * 0, z0); - STORE_4(dst + 8 * 1, z3); - STORE_4(dst + 8 * 2, z6); - STORE_4(dst + 8 * 3, z9); + if (0 == blockId) { + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z3); + STORE_4(dst + 8 * 2, z6); + STORE_4(dst + 8 * 3, z9); + } else { + + auto tmp0 = LOAD4(dst + 8 * 0); + auto tmp1 = LOAD4(dst + 8 * 1); + auto tmp2 = LOAD4(dst + 8 * 2); + auto tmp3 = LOAD4(dst + 8 * 3); + + z0 = _mm_add_ps(tmp0, z0); + z3 = _mm_add_ps(tmp1, z3); + z6 = _mm_add_ps(tmp2, z6); + z9 = _mm_add_ps(tmp3, z9); + + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z3); + STORE_4(dst + 8 * 2, z6); + STORE_4(dst + 8 * 3, z9); + } } } template @@ -1285,7 +1441,8 @@ static void 
_AVX_MNNPackedMatMul_int4_3(TYPE* C, const TYPE* A, const uint8_t* B auto cStride = parameter[3] / sizeof(TYPE); float weightBytes = 0.5; // sizeof(int4_t) auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); int lC4 = l / 4; int lR = lC4 * 4; @@ -1333,21 +1490,78 @@ static void _AVX_MNNPackedMatMul_int4_3(TYPE* C, const TYPE* A, const uint8_t* B weight2 += 2; weight3 += 2; } - STORE_4(dst0 + 0, _mm256_extractf128_ps(sumAvx00, 0)); - STORE_4(dst0 + 8, _mm256_extractf128_ps(sumAvx10, 0)); - STORE_4(dst0 + 16, _mm256_extractf128_ps(sumAvx20, 0)); - - STORE_4(dst1 + 0, _mm256_extractf128_ps(sumAvx00, 1)); - STORE_4(dst1 + 8, _mm256_extractf128_ps(sumAvx10, 1)); - STORE_4(dst1 + 16, _mm256_extractf128_ps(sumAvx20, 1)); - - STORE_4(dst2 + 0, _mm256_extractf128_ps(sumAvx01, 0)); - STORE_4(dst2 + 8, _mm256_extractf128_ps(sumAvx11, 0)); - STORE_4(dst2 + 16, _mm256_extractf128_ps(sumAvx21, 0)); - - STORE_4(dst3 + 0, _mm256_extractf128_ps(sumAvx01, 1)); - STORE_4(dst3 + 8, _mm256_extractf128_ps(sumAvx11, 1)); - STORE_4(dst3 + 16, _mm256_extractf128_ps(sumAvx21, 1)); + if (0 == blockId) { + STORE_4(dst0 + 0, _mm256_extractf128_ps(sumAvx00, 0)); + STORE_4(dst0 + 8, _mm256_extractf128_ps(sumAvx10, 0)); + STORE_4(dst0 + 16, _mm256_extractf128_ps(sumAvx20, 0)); + + STORE_4(dst1 + 0, _mm256_extractf128_ps(sumAvx00, 1)); + STORE_4(dst1 + 8, _mm256_extractf128_ps(sumAvx10, 1)); + STORE_4(dst1 + 16, _mm256_extractf128_ps(sumAvx20, 1)); + + STORE_4(dst2 + 0, _mm256_extractf128_ps(sumAvx01, 0)); + STORE_4(dst2 + 8, _mm256_extractf128_ps(sumAvx11, 0)); + STORE_4(dst2 + 16, _mm256_extractf128_ps(sumAvx21, 0)); + + STORE_4(dst3 + 0, _mm256_extractf128_ps(sumAvx01, 1)); + STORE_4(dst3 + 8, _mm256_extractf128_ps(sumAvx11, 1)); + STORE_4(dst3 + 16, _mm256_extractf128_ps(sumAvx21, 1)); + } else { + auto tmp00 = LOAD4(dst0 + 0); + auto tmp01 = LOAD4(dst0 + 8); + auto tmp02 = LOAD4(dst0 + 16); + + auto tmp10 = LOAD4(dst1 + 0); + auto tmp11 = LOAD4(dst1 + 8); + auto tmp12 = LOAD4(dst1 + 16); + + auto tmp20 = LOAD4(dst2 + 0); + auto tmp21 = LOAD4(dst2 + 8); + auto tmp22 = LOAD4(dst2 + 16); + + auto tmp30 = LOAD4(dst3 + 0); + auto tmp31 = LOAD4(dst3 + 8); + auto tmp32 = LOAD4(dst3 + 16); + + auto sum_tmp00 = _mm256_extractf128_ps(sumAvx00, 0); + auto sum_tmp01 = _mm256_extractf128_ps(sumAvx10, 0); + auto sum_tmp02 = _mm256_extractf128_ps(sumAvx20, 0); + auto sum_tmp10 = _mm256_extractf128_ps(sumAvx00, 1); + auto sum_tmp11 = _mm256_extractf128_ps(sumAvx10, 1); + auto sum_tmp12 = _mm256_extractf128_ps(sumAvx20, 1); + auto sum_tmp20 = _mm256_extractf128_ps(sumAvx01, 0); + auto sum_tmp21 = _mm256_extractf128_ps(sumAvx11, 0); + auto sum_tmp22 = _mm256_extractf128_ps(sumAvx21, 0); + auto sum_tmp30 = _mm256_extractf128_ps(sumAvx01, 1); + auto sum_tmp31 = _mm256_extractf128_ps(sumAvx11, 1); + auto sum_tmp32 = _mm256_extractf128_ps(sumAvx21, 1); + + sum_tmp00 = _mm_add_ps(tmp00, sum_tmp00); + sum_tmp01 = _mm_add_ps(tmp01, sum_tmp01); + sum_tmp02 = _mm_add_ps(tmp02, sum_tmp02); + sum_tmp10 = _mm_add_ps(tmp10, sum_tmp10); + sum_tmp11 = _mm_add_ps(tmp11, sum_tmp11); + sum_tmp12 = _mm_add_ps(tmp12, sum_tmp12); + sum_tmp20 = _mm_add_ps(tmp20, sum_tmp20); + sum_tmp21 = _mm_add_ps(tmp21, sum_tmp21); + sum_tmp22 = _mm_add_ps(tmp22, sum_tmp22); + sum_tmp30 = _mm_add_ps(tmp30, sum_tmp30); + sum_tmp31 = _mm_add_ps(tmp31, sum_tmp31); + sum_tmp32 = _mm_add_ps(tmp32, sum_tmp32); + + STORE_4(dst0 + 0, 
sum_tmp00); + STORE_4(dst0 + 8, sum_tmp01); + STORE_4(dst0 + 16, sum_tmp02); + STORE_4(dst1 + 0, sum_tmp10); + STORE_4(dst1 + 8, sum_tmp11); + STORE_4(dst1 + 16, sum_tmp12); + STORE_4(dst2 + 0, sum_tmp20); + STORE_4(dst2 + 8, sum_tmp21); + STORE_4(dst2 + 16, sum_tmp22); + STORE_4(dst3 + 0, sum_tmp30); + STORE_4(dst3 + 8, sum_tmp31); + STORE_4(dst3 + 16, sum_tmp32); + } } for (int y = hR; y < hC4; ++y) { @@ -1372,9 +1586,23 @@ static void _AVX_MNNPackedMatMul_int4_3(TYPE* C, const TYPE* A, const uint8_t* B z1 = MNNSSEFMA(s1, w0, z1); z2 = MNNSSEFMA(s2, w0, z2); } - STORE_4(dst + 8 * 0, z0); - STORE_4(dst + 8 * 1, z1); - STORE_4(dst + 8 * 2, z2); + if (0 == blockId) { + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z1); + STORE_4(dst + 8 * 2, z2); + } else { + auto tmp0 = LOAD4(dst + 8 * 0); + auto tmp1 = LOAD4(dst + 8 * 1); + auto tmp2 = LOAD4(dst + 8 * 2); + + z0 = _mm_add_ps(tmp0, z0); + z1 = _mm_add_ps(tmp1, z1); + z2 = _mm_add_ps(tmp2, z2); + + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z1); + STORE_4(dst + 8 * 2, z2); + } } } @@ -1386,7 +1614,8 @@ static void _AVX_MNNPackedMatMul_int4_2(TYPE* C, const TYPE* A, const uint8_t* B auto cStride = parameter[3] / sizeof(TYPE); float weightBytes = 0.5; auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); int lC4 = l / 4; int lR = lC4 * 4; @@ -1426,17 +1655,55 @@ static void _AVX_MNNPackedMatMul_int4_2(TYPE* C, const TYPE* A, const uint8_t* B weight2 += 2; weight3 += 2; } - STORE_4(dst0 + 0, _mm256_extractf128_ps(sumAvx00, 0)); - STORE_4(dst0 + 8, _mm256_extractf128_ps(sumAvx10, 0)); - - STORE_4(dst1 + 0, _mm256_extractf128_ps(sumAvx00, 1)); - STORE_4(dst1 + 8, _mm256_extractf128_ps(sumAvx10, 1)); - - STORE_4(dst2 + 0, _mm256_extractf128_ps(sumAvx01, 0)); - STORE_4(dst2 + 8, _mm256_extractf128_ps(sumAvx11, 0)); - - STORE_4(dst3 + 0, _mm256_extractf128_ps(sumAvx01, 1)); - STORE_4(dst3 + 8, _mm256_extractf128_ps(sumAvx11, 1)); + if (0 == blockId) { + STORE_4(dst0 + 0, _mm256_extractf128_ps(sumAvx00, 0)); + STORE_4(dst0 + 8, _mm256_extractf128_ps(sumAvx10, 0)); + + STORE_4(dst1 + 0, _mm256_extractf128_ps(sumAvx00, 1)); + STORE_4(dst1 + 8, _mm256_extractf128_ps(sumAvx10, 1)); + + STORE_4(dst2 + 0, _mm256_extractf128_ps(sumAvx01, 0)); + STORE_4(dst2 + 8, _mm256_extractf128_ps(sumAvx11, 0)); + + STORE_4(dst3 + 0, _mm256_extractf128_ps(sumAvx01, 1)); + STORE_4(dst3 + 8, _mm256_extractf128_ps(sumAvx11, 1)); + } else { + auto tmp01 = LOAD4(dst0 + 0); + auto tmp02 = LOAD4(dst0 + 8); + auto tmp11 = LOAD4(dst1 + 0); + auto tmp12 = LOAD4(dst1 + 8); + auto tmp21 = LOAD4(dst2 + 0); + auto tmp22 = LOAD4(dst2 + 8); + auto tmp31 = LOAD4(dst3 + 0); + auto tmp32 = LOAD4(dst3 + 8); + + auto x_tmp01 = _mm256_extractf128_ps(sumAvx00, 0); + auto x_tmp02 = _mm256_extractf128_ps(sumAvx10, 0); + auto x_tmp11 = _mm256_extractf128_ps(sumAvx00, 1); + auto x_tmp12 = _mm256_extractf128_ps(sumAvx10, 1); + auto x_tmp21 = _mm256_extractf128_ps(sumAvx01, 0); + auto x_tmp22 = _mm256_extractf128_ps(sumAvx11, 0); + auto x_tmp31 = _mm256_extractf128_ps(sumAvx01, 1); + auto x_tmp32 = _mm256_extractf128_ps(sumAvx11, 1); + + x_tmp01 = _mm_add_ps(tmp01, x_tmp01); + x_tmp02 = _mm_add_ps(tmp02, x_tmp02); + x_tmp11 = _mm_add_ps(tmp11, x_tmp11); + x_tmp12 = _mm_add_ps(tmp12, x_tmp12); + x_tmp21 = _mm_add_ps(tmp21, x_tmp21); + x_tmp22 = _mm_add_ps(tmp22, x_tmp22); + x_tmp31 = _mm_add_ps(tmp31, x_tmp31); + x_tmp32 = _mm_add_ps(tmp32, x_tmp32); + 
+ STORE_4(dst0 + 0, x_tmp01); + STORE_4(dst0 + 8, x_tmp02); + STORE_4(dst1 + 0, x_tmp11); + STORE_4(dst1 + 8, x_tmp12); + STORE_4(dst2 + 0, x_tmp21); + STORE_4(dst2 + 8, x_tmp22); + STORE_4(dst3 + 0, x_tmp31); + STORE_4(dst3 + 8, x_tmp32); + } } for (int y = hR; y < hC4; ++y) { @@ -1457,8 +1724,17 @@ static void _AVX_MNNPackedMatMul_int4_2(TYPE* C, const TYPE* A, const uint8_t* B z0 = MNNSSEFMA(s0, w0, z0); z1 = MNNSSEFMA(s1, w0, z1); } - STORE_4(dst + 8 * 0, z0); - STORE_4(dst + 8 * 1, z1); + if (0 == blockId) { + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z1); + } else { + auto t0 = LOAD4(dst + 8 * 0); + auto t1 = LOAD4(dst + 8 * 1); + z0 = _mm_add_ps(z0, t0); + z1 = _mm_add_ps(z1, t1); + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z1); + } } } @@ -1471,11 +1747,12 @@ static void _AVX_MNNPackednMatMulRemainCommon_int4(TYPE* C, const TYPE* A, const auto cStride = parameter[3] / sizeof(TYPE); float weightBytes = 0.5; // sizeof(int4_t) auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; auto hC4 = UP_DIV(h, 4); auto es = eSize; auto oC = C; auto aStride = parameter[0] / sizeof(TYPE); + size_t blockId = parameter[6]; if (eSize >= 20) { _AVX_MNNPackedMatMul_int4_20(C, A, B, parameter, k, b); eSize -= 20; @@ -1597,10 +1874,25 @@ static void _AVX_MNNPackednMatMulRemainCommon_int4(TYPE* C, const TYPE* A, const sum3 = MNNSSEFMA(s, w3, sum3); srcUse += aStride; } - STORE_4(dst0, sum0); - STORE_4(dst1, sum1); - STORE_4(dst2, sum2); - STORE_4(dst3, sum3); + if (blockId == 0) { + STORE_4(dst0, sum0); + STORE_4(dst1, sum1); + STORE_4(dst2, sum2); + STORE_4(dst3, sum3); + } else { + auto tmp_0 = LOAD4(dst0); + auto tmp_1 = LOAD4(dst1); + auto tmp_2 = LOAD4(dst2); + auto tmp_3 = LOAD4(dst3); + sum0 = _mm_add_ps(tmp_0, sum0); + sum1 = _mm_add_ps(tmp_1, sum1); + sum2 = _mm_add_ps(tmp_2, sum2); + sum3 = _mm_add_ps(tmp_3, sum3); + STORE_4(dst0, sum0); + STORE_4(dst1, sum1); + STORE_4(dst2, sum2); + STORE_4(dst3, sum3); + } } for (int y = hR; y < hC4; ++y) { auto weight = B + y * bStride / 2; @@ -1636,7 +1928,13 @@ static void _AVX_MNNPackednMatMulRemainCommon_int4(TYPE* C, const TYPE* A, const sum = MNNSSEFMA(s, w, sum); srcUse += aStride; } - STORE_4(dst, sum); + if (blockId == 0) { + STORE_4(dst, sum); + } else { + auto tmp_0 = LOAD4(dst); + sum = _mm_add_ps(tmp_0, sum); + STORE_4(dst, sum); + } } } @@ -1684,9 +1982,10 @@ static void _AVX_MNNPackedMatMul_Main_int8(TYPE* C, const TYPE* A, const TYPE* f auto cStride = parameter[3] / sizeof(TYPE); int weightBytes = sizeof(int8_t); auto bExtraStride = parameter[5] / weightBytes; - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; auto hC4 = UP_DIV(h, 4); float ws_tmp[4]; + size_t blockId = parameter[6]; for (int y = 0; y < hC4; ++y) { auto weight = B + y * bStride; auto dst = C + (y / 2) * cStride + 4 * (y % 2); @@ -1737,12 +2036,21 @@ static void _AVX_MNNPackedMatMul_Main_int8(TYPE* C, const TYPE* A, const TYPE* f z10 = MNNAVXFMA(s1, w3, z10); z11 = MNNAVXFMA(s2, w3, z11); } - TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(1, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); - TRANPOSE_SAVE(1, 1, z1, z4, z7, z10); - TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); - TRANPOSE_SAVE(1, 2, z2, z5, z8, z11); + if (blockId == 0) { + TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(1, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + TRANPOSE_SAVE(1, 1, z1, z4, z7, z10); + TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); + 
TRANPOSE_SAVE(1, 2, z2, z5, z8, z11); + } else { + FMLA_TRANSPOSE_SAVE(0, 0, z0, z3, z6, z9); + FMLA_TRANSPOSE_SAVE(1, 0, z0, z3, z6, z9); + FMLA_TRANSPOSE_SAVE(0, 1, z1, z4, z7, z10); + FMLA_TRANSPOSE_SAVE(1, 1, z1, z4, z7, z10); + FMLA_TRANSPOSE_SAVE(0, 2, z2, z5, z8, z11); + FMLA_TRANSPOSE_SAVE(1, 2, z2, z5, z8, z11); + } } } @@ -1755,7 +2063,8 @@ static void _AVX_MNNPackedMatMul_int8_20(TYPE* C, const TYPE* A, const int8_t* B auto cStride = parameter[3] / sizeof(TYPE); int weightBytes = sizeof(int8_t); auto bExtraStride = parameter[5] / weightBytes; - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); float ws_tmp[4]; for (int y = 0; y < hC4; ++y) { @@ -1807,11 +2116,19 @@ static void _AVX_MNNPackedMatMul_int8_20(TYPE* C, const TYPE* A, const int8_t* B z10 = MNNAVXFMA(s1, w3, z10); z11 = MNNAVXFMA(s2, w3, z11); } - TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(1, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); - TRANPOSE_SAVE(1, 1, z1, z4, z7, z10); - TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); + if (0 == blockId) { + TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(1, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + TRANPOSE_SAVE(1, 1, z1, z4, z7, z10); + TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); + } else { + FMLA_TRANSPOSE_SAVE(0, 0, z0, z3, z6, z9); + FMLA_TRANSPOSE_SAVE(1, 0, z0, z3, z6, z9); + FMLA_TRANSPOSE_SAVE(0, 1, z1, z4, z7, z10); + FMLA_TRANSPOSE_SAVE(1, 1, z1, z4, z7, z10); + FMLA_TRANSPOSE_SAVE(0, 2, z2, z5, z8, z11); + } } } @@ -1822,7 +2139,8 @@ static void _AVX_MNNPackedMatMul_int8_16(TYPE* C, const TYPE* A, const int8_t* B auto l = parameter[1]; auto cStride = parameter[3] / sizeof(TYPE); auto bExtraStride = parameter[5] / sizeof(int8_t); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); float ws_tmp[4]; for (int y = 0; y < hC4; ++y) { @@ -1864,10 +2182,17 @@ static void _AVX_MNNPackedMatMul_int8_16(TYPE* C, const TYPE* A, const int8_t* B z9 = MNNAVXFMA(s0, w3, z9); z10 = MNNAVXFMA(s1, w3, z10); } - TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(1, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); - TRANPOSE_SAVE(1, 1, z1, z4, z7, z10); + if (0 == blockId) { + TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(1, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + TRANPOSE_SAVE(1, 1, z1, z4, z7, z10); + } else { + FMLA_TRANSPOSE_SAVE(0, 0, z0, z3, z6, z9); + FMLA_TRANSPOSE_SAVE(1, 0, z0, z3, z6, z9); + FMLA_TRANSPOSE_SAVE(0, 1, z1, z4, z7, z10); + FMLA_TRANSPOSE_SAVE(1, 1, z1, z4, z7, z10); + } } } @@ -1878,7 +2203,8 @@ static void _AVX_MNNPackedMatMul_int8_5(TYPE* C, const TYPE* A, const int8_t* B, auto l = parameter[1]; auto cStride = parameter[3] / sizeof(TYPE); auto bExtraStride = parameter[5] / sizeof(int8_t); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); int lC4 = l / 4; int lR = lC4 * 4; @@ -1939,17 +2265,53 @@ static void _AVX_MNNPackedMatMul_int8_5(TYPE* C, const TYPE* A, const int8_t* B, weight2 += 4; weight3 += 4; } - STORE_8(dst0, sumAvx00); - STORE_8(dst0 + 8, sumAvx10); - STORE_8(dst0 + 16, sumAvx20); - STORE_8(dst0 + 24, sumAvx30); - STORE_8(dst0 + 32, sumAvx40); - - STORE_8(dst2, sumAvx01); - STORE_8(dst2 + 8, sumAvx11); - STORE_8(dst2 + 16, sumAvx21); - STORE_8(dst2 + 24, sumAvx31); - STORE_8(dst2 + 32, sumAvx41); + if (0 == blockId) { + 
STORE_8(dst0, sumAvx00); + STORE_8(dst0 + 8, sumAvx10); + STORE_8(dst0 + 16, sumAvx20); + STORE_8(dst0 + 24, sumAvx30); + STORE_8(dst0 + 32, sumAvx40); + + STORE_8(dst2, sumAvx01); + STORE_8(dst2 + 8, sumAvx11); + STORE_8(dst2 + 16, sumAvx21); + STORE_8(dst2 + 24, sumAvx31); + STORE_8(dst2 + 32, sumAvx41); + } else { + auto tmp0 = LOAD8(dst0); + auto tmp1 = LOAD8(dst0 + 8); + auto tmp2 = LOAD8(dst0 + 16); + auto tmp3 = LOAD8(dst0 + 24); + auto tmp4 = LOAD8(dst0 + 32); + auto tmp5 = LOAD8(dst2); + auto tmp6 = LOAD8(dst2 + 8); + auto tmp7 = LOAD8(dst2 + 16); + auto tmp8 = LOAD8(dst2 + 24); + auto tmp9 = LOAD8(dst2 + 32); + + sumAvx00 = _mm256_add_ps(sumAvx00, tmp0); + sumAvx10 = _mm256_add_ps(sumAvx10, tmp1); + sumAvx20 = _mm256_add_ps(sumAvx20, tmp2); + sumAvx30 = _mm256_add_ps(sumAvx30, tmp3); + sumAvx40 = _mm256_add_ps(sumAvx40, tmp4); + sumAvx01 = _mm256_add_ps(sumAvx01, tmp5); + sumAvx11 = _mm256_add_ps(sumAvx11, tmp6); + sumAvx21 = _mm256_add_ps(sumAvx21, tmp7); + sumAvx31 = _mm256_add_ps(sumAvx31, tmp8); + sumAvx41 = _mm256_add_ps(sumAvx41, tmp9); + + STORE_8(dst0, sumAvx00); + STORE_8(dst0 + 8, sumAvx10); + STORE_8(dst0 + 16, sumAvx20); + STORE_8(dst0 + 24, sumAvx30); + STORE_8(dst0 + 32, sumAvx40); + + STORE_8(dst2, sumAvx01); + STORE_8(dst2 + 8, sumAvx11); + STORE_8(dst2 + 16, sumAvx21); + STORE_8(dst2 + 24, sumAvx31); + STORE_8(dst2 + 32, sumAvx41); + } } for (int y = hR; y < hC4; ++y) { auto weight = B + y * bStride; @@ -1981,11 +2343,31 @@ static void _AVX_MNNPackedMatMul_int8_5(TYPE* C, const TYPE* A, const int8_t* B, z3 = MNNSSEFMA(s3, w0, z3); z4 = MNNSSEFMA(s4, w0, z4); } - STORE_4(dst + 8 * 0, z0); - STORE_4(dst + 8 * 1, z1); - STORE_4(dst + 8 * 2, z2); - STORE_4(dst + 8 * 3, z3); - STORE_4(dst + 8 * 4, z4); + if (0 == blockId) { + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z1); + STORE_4(dst + 8 * 2, z2); + STORE_4(dst + 8 * 3, z3); + STORE_4(dst + 8 * 4, z4); + } else { + auto tmp0 = LOAD4(dst + 8 * 0); + auto tmp1 = LOAD4(dst + 8 * 1); + auto tmp2 = LOAD4(dst + 8 * 2); + auto tmp3 = LOAD4(dst + 8 * 3); + auto tmp4 = LOAD4(dst + 8 * 4); + + z0 = _mm_add_ps(tmp0, z0); + z1 = _mm_add_ps(tmp1, z1); + z2 = _mm_add_ps(tmp2, z2); + z3 = _mm_add_ps(tmp3, z3); + z4 = _mm_add_ps(tmp4, z4); + + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z1); + STORE_4(dst + 8 * 2, z2); + STORE_4(dst + 8 * 3, z3); + STORE_4(dst + 8 * 4, z4); + } } } @@ -1997,7 +2379,8 @@ static void _AVX_MNNPackedMatMul_int8_4(TYPE* C, const TYPE* A, const int8_t* B, auto l = parameter[1]; auto cStride = parameter[3] / sizeof(TYPE); auto bExtraStride = parameter[5] / sizeof(int8_t); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); int lC4 = l / 4; int lR = lC4 * 4; @@ -2052,15 +2435,47 @@ static void _AVX_MNNPackedMatMul_int8_4(TYPE* C, const TYPE* A, const int8_t* B, weight2 += 4; weight3 += 4; } - STORE_8(dst0, sumAvx00); - STORE_8(dst0 + 8, sumAvx10); - STORE_8(dst0 + 16, sumAvx20); - STORE_8(dst0 + 24, sumAvx30); - - STORE_8(dst2, sumAvx01); - STORE_8(dst2 + 8, sumAvx11); - STORE_8(dst2 + 16, sumAvx21); - STORE_8(dst2 + 24, sumAvx31); + if (0 == blockId) { + STORE_8(dst0, sumAvx00); + STORE_8(dst0 + 8, sumAvx10); + STORE_8(dst0 + 16, sumAvx20); + STORE_8(dst0 + 24, sumAvx30); + + STORE_8(dst2, sumAvx01); + STORE_8(dst2 + 8, sumAvx11); + STORE_8(dst2 + 16, sumAvx21); + STORE_8(dst2 + 24, sumAvx31); + } else { + auto tmp0 = LOAD8(dst0); + auto tmp1 = LOAD8(dst0 + 8); + auto tmp2 = LOAD8(dst0 + 16); + auto tmp3 = 
LOAD8(dst0 + 24); + + auto tmp5 = LOAD8(dst2); + auto tmp6 = LOAD8(dst2 + 8); + auto tmp7 = LOAD8(dst2 + 16); + auto tmp8 = LOAD8(dst2 + 24); + + sumAvx00 = _mm256_add_ps(sumAvx00, tmp0); + sumAvx10 = _mm256_add_ps(sumAvx10, tmp1); + sumAvx20 = _mm256_add_ps(sumAvx20, tmp2); + sumAvx30 = _mm256_add_ps(sumAvx30, tmp3); + + sumAvx01 = _mm256_add_ps(sumAvx01, tmp5); + sumAvx11 = _mm256_add_ps(sumAvx11, tmp6); + sumAvx21 = _mm256_add_ps(sumAvx21, tmp7); + sumAvx31 = _mm256_add_ps(sumAvx31, tmp8); + + STORE_8(dst0, sumAvx00); + STORE_8(dst0 + 8, sumAvx10); + STORE_8(dst0 + 16, sumAvx20); + STORE_8(dst0 + 24, sumAvx30); + + STORE_8(dst2, sumAvx01); + STORE_8(dst2 + 8, sumAvx11); + STORE_8(dst2 + 16, sumAvx21); + STORE_8(dst2 + 24, sumAvx31); + } } float ws_tmp[4]; for (int y = hR; y < hC4; ++y) { @@ -2094,10 +2509,28 @@ static void _AVX_MNNPackedMatMul_int8_4(TYPE* C, const TYPE* A, const int8_t* B, z9 = MNNSSEFMA(s0, w3, z9); } _MM_TRANSPOSE4_PS(z0, z3, z6, z9); - STORE_4(dst + 8 * 0, z0); - STORE_4(dst + 8 * 1, z3); - STORE_4(dst + 8 * 2, z6); - STORE_4(dst + 8 * 3, z9); + if (0 == blockId) { + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z3); + STORE_4(dst + 8 * 2, z6); + STORE_4(dst + 8 * 3, z9); + } else { + + auto tmp0 = LOAD4(dst + 8 * 0); + auto tmp1 = LOAD4(dst + 8 * 1); + auto tmp2 = LOAD4(dst + 8 * 2); + auto tmp3 = LOAD4(dst + 8 * 3); + + z0 = _mm_add_ps(tmp0, z0); + z3 = _mm_add_ps(tmp1, z3); + z6 = _mm_add_ps(tmp2, z6); + z9 = _mm_add_ps(tmp3, z9); + + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z3); + STORE_4(dst + 8 * 2, z6); + STORE_4(dst + 8 * 3, z9); + } } } template @@ -2107,7 +2540,8 @@ static void _AVX_MNNPackedMatMul_int8_3(TYPE* C, const TYPE* A, const int8_t* B, auto l = parameter[1]; auto cStride = parameter[3] / sizeof(TYPE); auto bExtraStride = parameter[5] / sizeof(int8_t); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); int lC4 = l / 4; int lR = lC4 * 4; @@ -2155,21 +2589,78 @@ static void _AVX_MNNPackedMatMul_int8_3(TYPE* C, const TYPE* A, const int8_t* B, weight2 += 4; weight3 += 4; } - STORE_4(dst0 + 0, _mm256_extractf128_ps(sumAvx00, 0)); - STORE_4(dst0 + 8, _mm256_extractf128_ps(sumAvx10, 0)); - STORE_4(dst0 + 16, _mm256_extractf128_ps(sumAvx20, 0)); - - STORE_4(dst1 + 0, _mm256_extractf128_ps(sumAvx00, 1)); - STORE_4(dst1 + 8, _mm256_extractf128_ps(sumAvx10, 1)); - STORE_4(dst1 + 16, _mm256_extractf128_ps(sumAvx20, 1)); - - STORE_4(dst2 + 0, _mm256_extractf128_ps(sumAvx01, 0)); - STORE_4(dst2 + 8, _mm256_extractf128_ps(sumAvx11, 0)); - STORE_4(dst2 + 16, _mm256_extractf128_ps(sumAvx21, 0)); - - STORE_4(dst3 + 0, _mm256_extractf128_ps(sumAvx01, 1)); - STORE_4(dst3 + 8, _mm256_extractf128_ps(sumAvx11, 1)); - STORE_4(dst3 + 16, _mm256_extractf128_ps(sumAvx21, 1)); + if (0 == blockId) { + STORE_4(dst0 + 0, _mm256_extractf128_ps(sumAvx00, 0)); + STORE_4(dst0 + 8, _mm256_extractf128_ps(sumAvx10, 0)); + STORE_4(dst0 + 16, _mm256_extractf128_ps(sumAvx20, 0)); + + STORE_4(dst1 + 0, _mm256_extractf128_ps(sumAvx00, 1)); + STORE_4(dst1 + 8, _mm256_extractf128_ps(sumAvx10, 1)); + STORE_4(dst1 + 16, _mm256_extractf128_ps(sumAvx20, 1)); + + STORE_4(dst2 + 0, _mm256_extractf128_ps(sumAvx01, 0)); + STORE_4(dst2 + 8, _mm256_extractf128_ps(sumAvx11, 0)); + STORE_4(dst2 + 16, _mm256_extractf128_ps(sumAvx21, 0)); + + STORE_4(dst3 + 0, _mm256_extractf128_ps(sumAvx01, 1)); + STORE_4(dst3 + 8, _mm256_extractf128_ps(sumAvx11, 1)); + STORE_4(dst3 + 16, _mm256_extractf128_ps(sumAvx21, 1)); + } 
else { + auto tmp00 = LOAD4(dst0 + 0); + auto tmp01 = LOAD4(dst0 + 8); + auto tmp02 = LOAD4(dst0 + 16); + + auto tmp10 = LOAD4(dst1 + 0); + auto tmp11 = LOAD4(dst1 + 8); + auto tmp12 = LOAD4(dst1 + 16); + + auto tmp20 = LOAD4(dst2 + 0); + auto tmp21 = LOAD4(dst2 + 8); + auto tmp22 = LOAD4(dst2 + 16); + + auto tmp30 = LOAD4(dst3 + 0); + auto tmp31 = LOAD4(dst3 + 8); + auto tmp32 = LOAD4(dst3 + 16); + + auto sum_tmp00 = _mm256_extractf128_ps(sumAvx00, 0); + auto sum_tmp01 = _mm256_extractf128_ps(sumAvx10, 0); + auto sum_tmp02 = _mm256_extractf128_ps(sumAvx20, 0); + auto sum_tmp10 = _mm256_extractf128_ps(sumAvx00, 1); + auto sum_tmp11 = _mm256_extractf128_ps(sumAvx10, 1); + auto sum_tmp12 = _mm256_extractf128_ps(sumAvx20, 1); + auto sum_tmp20 = _mm256_extractf128_ps(sumAvx01, 0); + auto sum_tmp21 = _mm256_extractf128_ps(sumAvx11, 0); + auto sum_tmp22 = _mm256_extractf128_ps(sumAvx21, 0); + auto sum_tmp30 = _mm256_extractf128_ps(sumAvx01, 1); + auto sum_tmp31 = _mm256_extractf128_ps(sumAvx11, 1); + auto sum_tmp32 = _mm256_extractf128_ps(sumAvx21, 1); + + sum_tmp00 = _mm_add_ps(tmp00, sum_tmp00); + sum_tmp01 = _mm_add_ps(tmp01, sum_tmp01); + sum_tmp02 = _mm_add_ps(tmp02, sum_tmp02); + sum_tmp10 = _mm_add_ps(tmp10, sum_tmp10); + sum_tmp11 = _mm_add_ps(tmp11, sum_tmp11); + sum_tmp12 = _mm_add_ps(tmp12, sum_tmp12); + sum_tmp20 = _mm_add_ps(tmp20, sum_tmp20); + sum_tmp21 = _mm_add_ps(tmp21, sum_tmp21); + sum_tmp22 = _mm_add_ps(tmp22, sum_tmp22); + sum_tmp30 = _mm_add_ps(tmp30, sum_tmp30); + sum_tmp31 = _mm_add_ps(tmp31, sum_tmp31); + sum_tmp32 = _mm_add_ps(tmp32, sum_tmp32); + + STORE_4(dst0 + 0, sum_tmp00); + STORE_4(dst0 + 8, sum_tmp01); + STORE_4(dst0 + 16, sum_tmp02); + STORE_4(dst1 + 0, sum_tmp10); + STORE_4(dst1 + 8, sum_tmp11); + STORE_4(dst1 + 16, sum_tmp12); + STORE_4(dst2 + 0, sum_tmp20); + STORE_4(dst2 + 8, sum_tmp21); + STORE_4(dst2 + 16, sum_tmp22); + STORE_4(dst3 + 0, sum_tmp30); + STORE_4(dst3 + 8, sum_tmp31); + STORE_4(dst3 + 16, sum_tmp32); + } } for (int y = hR; y < hC4; ++y) { @@ -2194,9 +2685,23 @@ static void _AVX_MNNPackedMatMul_int8_3(TYPE* C, const TYPE* A, const int8_t* B, z1 = MNNSSEFMA(s1, w0, z1); z2 = MNNSSEFMA(s2, w0, z2); } - STORE_4(dst + 8 * 0, z0); - STORE_4(dst + 8 * 1, z1); - STORE_4(dst + 8 * 2, z2); + if (0 == blockId) { + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z1); + STORE_4(dst + 8 * 2, z2); + } else { + auto tmp0 = LOAD4(dst + 8 * 0); + auto tmp1 = LOAD4(dst + 8 * 1); + auto tmp2 = LOAD4(dst + 8 * 2); + + z0 = _mm_add_ps(tmp0, z0); + z1 = _mm_add_ps(tmp1, z1); + z2 = _mm_add_ps(tmp2, z2); + + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z1); + STORE_4(dst + 8 * 2, z2); + } } } @@ -2207,7 +2712,8 @@ static void _AVX_MNNPackedMatMul_int8_2(TYPE* C, const TYPE* A, const int8_t* B, auto l = parameter[1]; auto cStride = parameter[3] / sizeof(TYPE); auto bExtraStride = parameter[5] / sizeof(int8_t); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); int lC4 = l / 4; int lR = lC4 * 4; @@ -2247,17 +2753,55 @@ static void _AVX_MNNPackedMatMul_int8_2(TYPE* C, const TYPE* A, const int8_t* B, weight2 += 4; weight3 += 4; } - STORE_4(dst0 + 0, _mm256_extractf128_ps(sumAvx00, 0)); - STORE_4(dst0 + 8, _mm256_extractf128_ps(sumAvx10, 0)); - - STORE_4(dst1 + 0, _mm256_extractf128_ps(sumAvx00, 1)); - STORE_4(dst1 + 8, _mm256_extractf128_ps(sumAvx10, 1)); - - STORE_4(dst2 + 0, _mm256_extractf128_ps(sumAvx01, 0)); - STORE_4(dst2 + 8, _mm256_extractf128_ps(sumAvx11, 0)); - - 
STORE_4(dst3 + 0, _mm256_extractf128_ps(sumAvx01, 1)); - STORE_4(dst3 + 8, _mm256_extractf128_ps(sumAvx11, 1)); + if (0 == blockId) { + STORE_4(dst0 + 0, _mm256_extractf128_ps(sumAvx00, 0)); + STORE_4(dst0 + 8, _mm256_extractf128_ps(sumAvx10, 0)); + + STORE_4(dst1 + 0, _mm256_extractf128_ps(sumAvx00, 1)); + STORE_4(dst1 + 8, _mm256_extractf128_ps(sumAvx10, 1)); + + STORE_4(dst2 + 0, _mm256_extractf128_ps(sumAvx01, 0)); + STORE_4(dst2 + 8, _mm256_extractf128_ps(sumAvx11, 0)); + + STORE_4(dst3 + 0, _mm256_extractf128_ps(sumAvx01, 1)); + STORE_4(dst3 + 8, _mm256_extractf128_ps(sumAvx11, 1)); + } else { + auto tmp01 = LOAD4(dst0 + 0); + auto tmp02 = LOAD4(dst0 + 8); + auto tmp11 = LOAD4(dst1 + 0); + auto tmp12 = LOAD4(dst1 + 8); + auto tmp21 = LOAD4(dst2 + 0); + auto tmp22 = LOAD4(dst2 + 8); + auto tmp31 = LOAD4(dst3 + 0); + auto tmp32 = LOAD4(dst3 + 8); + + auto x_tmp01 = _mm256_extractf128_ps(sumAvx00, 0); + auto x_tmp02 = _mm256_extractf128_ps(sumAvx10, 0); + auto x_tmp11 = _mm256_extractf128_ps(sumAvx00, 1); + auto x_tmp12 = _mm256_extractf128_ps(sumAvx10, 1); + auto x_tmp21 = _mm256_extractf128_ps(sumAvx01, 0); + auto x_tmp22 = _mm256_extractf128_ps(sumAvx11, 0); + auto x_tmp31 = _mm256_extractf128_ps(sumAvx01, 1); + auto x_tmp32 = _mm256_extractf128_ps(sumAvx11, 1); + + x_tmp01 = _mm_add_ps(tmp01, x_tmp01); + x_tmp02 = _mm_add_ps(tmp02, x_tmp02); + x_tmp11 = _mm_add_ps(tmp11, x_tmp11); + x_tmp12 = _mm_add_ps(tmp12, x_tmp12); + x_tmp21 = _mm_add_ps(tmp21, x_tmp21); + x_tmp22 = _mm_add_ps(tmp22, x_tmp22); + x_tmp31 = _mm_add_ps(tmp31, x_tmp31); + x_tmp32 = _mm_add_ps(tmp32, x_tmp32); + + STORE_4(dst0 + 0, x_tmp01); + STORE_4(dst0 + 8, x_tmp02); + STORE_4(dst1 + 0, x_tmp11); + STORE_4(dst1 + 8, x_tmp12); + STORE_4(dst2 + 0, x_tmp21); + STORE_4(dst2 + 8, x_tmp22); + STORE_4(dst3 + 0, x_tmp31); + STORE_4(dst3 + 8, x_tmp32); + } } for (int y = hR; y < hC4; ++y) { @@ -2278,8 +2822,17 @@ static void _AVX_MNNPackedMatMul_int8_2(TYPE* C, const TYPE* A, const int8_t* B, z0 = MNNSSEFMA(s0, w0, z0); z1 = MNNSSEFMA(s1, w0, z1); } - STORE_4(dst + 8 * 0, z0); - STORE_4(dst + 8 * 1, z1); + if (0 == blockId) { + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z1); + } else { + auto t0 = LOAD4(dst + 8 * 0); + auto t1 = LOAD4(dst + 8 * 1); + z0 = _mm_add_ps(z0, t0); + z1 = _mm_add_ps(z1, t1); + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z1); + } } } @@ -2291,7 +2844,8 @@ static void _AVX_MNNPackednMatMulRemainCommon_int8(TYPE* C, const TYPE* A, const auto l = parameter[1]; auto cStride = parameter[3] / sizeof(TYPE); auto bExtraStride = parameter[5] / sizeof(int8_t); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); auto es = eSize; auto oC = C; @@ -2417,10 +2971,25 @@ static void _AVX_MNNPackednMatMulRemainCommon_int8(TYPE* C, const TYPE* A, const sum3 = MNNSSEFMA(s, w3, sum3); srcUse += aStride; } - STORE_4(dst0, sum0); - STORE_4(dst1, sum1); - STORE_4(dst2, sum2); - STORE_4(dst3, sum3); + if (blockId == 0) { + STORE_4(dst0, sum0); + STORE_4(dst1, sum1); + STORE_4(dst2, sum2); + STORE_4(dst3, sum3); + } else { + auto tmp_0 = LOAD4(dst0); + auto tmp_1 = LOAD4(dst1); + auto tmp_2 = LOAD4(dst2); + auto tmp_3 = LOAD4(dst3); + sum0 = _mm_add_ps(tmp_0, sum0); + sum1 = _mm_add_ps(tmp_1, sum1); + sum2 = _mm_add_ps(tmp_2, sum2); + sum3 = _mm_add_ps(tmp_3, sum3); + STORE_4(dst0, sum0); + STORE_4(dst1, sum1); + STORE_4(dst2, sum2); + STORE_4(dst3, sum3); + } } for (int y = hR; y < hC4; ++y) { auto weight = B + y * bStride; @@ 
-2456,7 +3025,14 @@ static void _AVX_MNNPackednMatMulRemainCommon_int8(TYPE* C, const TYPE* A, const sum = MNNSSEFMA(s, w, sum); srcUse += aStride; } - STORE_4(dst, sum); + if (blockId == 0) { + STORE_4(dst, sum); + } else { + auto tmp_0 = LOAD4(dst); + sum = _mm_add_ps(tmp_0, sum); + STORE_4(dst, sum); + + } } } diff --git a/source/backend/cpu/x86_x64/sse/GemmCommon.hpp b/source/backend/cpu/x86_x64/sse/GemmCommon.hpp index 007a55c7c..8d555e95c 100644 --- a/source/backend/cpu/x86_x64/sse/GemmCommon.hpp +++ b/source/backend/cpu/x86_x64/sse/GemmCommon.hpp @@ -24,6 +24,27 @@ _mm_storeu_ps(dst + 4 * (3 + 4 * v), m3); \ } +#define FMLA_TRANPOSE_SAVE(u, v, z0, z3, z6, z9) \ + { \ + auto m0 = z0; \ + auto m1 = z3; \ + auto m2 = z6; \ + auto m3 = z9; \ + _MM_TRANSPOSE4_PS(m0, m1, m2, m3); \ + auto t0 = _mm_loadu_ps(dst + 4 * (0 + 4 * v));\ + auto t1 = _mm_loadu_ps(dst + 4 * (1 + 4 * v));\ + auto t2 = _mm_loadu_ps(dst + 4 * (2 + 4 * v));\ + auto t3 = _mm_loadu_ps(dst + 4 * (3 + 4 * v));\ + m0 = _mm_add_ps(m0, t0);\ + m1 = _mm_add_ps(m1, t1);\ + m2 = _mm_add_ps(m2, t2);\ + m3 = _mm_add_ps(m3, t3);\ + _mm_storeu_ps(dst + 4 * (0 + 4 * v), m0); \ + _mm_storeu_ps(dst + 4 * (1 + 4 * v), m1); \ + _mm_storeu_ps(dst + 4 * (2 + 4 * v), m2); \ + _mm_storeu_ps(dst + 4 * (3 + 4 * v), m3); \ + } + void _SSE_GemmPostTreat(float* C, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias); #endif diff --git a/source/backend/cpu/x86_x64/sse/GemmFunction.hpp b/source/backend/cpu/x86_x64/sse/GemmFunction.hpp index 57c2558b0..e0272c184 100644 --- a/source/backend/cpu/x86_x64/sse/GemmFunction.hpp +++ b/source/backend/cpu/x86_x64/sse/GemmFunction.hpp @@ -224,8 +224,9 @@ static void _SSE_MNNPackedMatMul_12_int4(float* C, const float* A, const float* auto cStride = parameter[3] / sizeof(float); float weightBytes = 0.5; // sizeof(int4_t) auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; auto hC4 = UP_DIV(h, 4); + auto blockId = parameter[6]; float ws_tmp[4]; for (int y = 0; y < hC4; ++y) { auto weight = B + y * bStride / 2; @@ -277,9 +278,15 @@ static void _SSE_MNNPackedMatMul_12_int4(float* C, const float* A, const float* z10 = MNNSSEFMA(s1, w3, z10); z11 = MNNSSEFMA(s2, w3, z11); } - TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); - TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); + if (0 == blockId) { + TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); + } else { + FMLA_TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + FMLA_TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + FMLA_TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); + } } } @@ -290,7 +297,8 @@ static void _SSE_MNNPackedMatMul_8_int4(float* C, const float* A, const uint8_t* auto cStride = parameter[3] / sizeof(float); float weightBytes = 0.5; // sizeof(int4_t) auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); float ws_tmp[4]; for (int y = 0; y < hC4; ++y) { @@ -333,8 +341,13 @@ static void _SSE_MNNPackedMatMul_8_int4(float* C, const float* A, const uint8_t* z7 = MNNSSEFMA(s1, w2, z7); z10 = MNNSSEFMA(s1, w3, z10); } - TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + if (0 == blockId) { + TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + } else { + FMLA_TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + 
FMLA_TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + } } } @@ -345,7 +358,8 @@ static void _SSE_MNNPackedMatMul_4_int4(float* C, const float* A, const uint8_t* auto cStride = parameter[3] / sizeof(float); float weightBytes = 0.5; auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); float ws_tmp[4]; for (int y = 0; y < hC4; ++y) { @@ -379,10 +393,26 @@ static void _SSE_MNNPackedMatMul_4_int4(float* C, const float* A, const uint8_t* z9 = MNNSSEFMA(s0, w3, z9); } _MM_TRANSPOSE4_PS(z0, z3, z6, z9); - _mm_storeu_ps(dst + 4 * 0, z0); - _mm_storeu_ps(dst + 4 * 1, z3); - _mm_storeu_ps(dst + 4 * 2, z6); - _mm_storeu_ps(dst + 4 * 3, z9); + if (0 == blockId) { + _mm_storeu_ps(dst + 4 * 0, z0); + _mm_storeu_ps(dst + 4 * 1, z3); + _mm_storeu_ps(dst + 4 * 2, z6); + _mm_storeu_ps(dst + 4 * 3, z9); + } else { + auto t0 = _mm_loadu_ps(dst + 4 * 0); + auto t1 = _mm_loadu_ps(dst + 4 * 1); + auto t2 = _mm_loadu_ps(dst + 4 * 2); + auto t3 = _mm_loadu_ps(dst + 4 * 3); + + z0 = _mm_add_ps(z0, t0); + z3 = _mm_add_ps(z3, t1); + z6 = _mm_add_ps(z6, t2); + z9 = _mm_add_ps(z9, t3); + _mm_storeu_ps(dst + 4 * 0, z0); + _mm_storeu_ps(dst + 4 * 1, z3); + _mm_storeu_ps(dst + 4 * 2, z6); + _mm_storeu_ps(dst + 4 * 3, z9); + } } } @@ -394,7 +424,8 @@ static void _SSE_MNNPackednMatMulRemainCommon_int4(float* C, const float* A, con auto cStride = parameter[3] / sizeof(float); float weightBytes = 0.5; // sizeof(int4_t) auto bExtraStride = static_cast(parameter[5] / weightBytes); // parameter[5]/weightBytes - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); auto es = eSize; auto oC = C; @@ -424,7 +455,15 @@ static void _SSE_MNNPackednMatMulRemainCommon_int4(float* C, const float* A, con auto w = _load_int4x4(weight + sy * 2, alpha, bias); sum = MNNSSEFMA(s, w, sum); } - _mm_storeu_ps(dst, sum); + if (0 == blockId) { + _mm_storeu_ps(dst, sum); + } else { + auto tmp = _mm_loadu_ps(dst); + sum = _mm_add_ps(sum, tmp); + _mm_storeu_ps(dst, sum); + } + + } } } @@ -446,7 +485,8 @@ static void _SSE_MNNPackedMatMul_12_int8(float* C, const float* A, const float* auto cStride = parameter[3] / sizeof(float); float weightBytes = 1; // sizeof(int8_t) auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); float ws_tmp[4]; for (int y = 0; y < hC4; ++y) { @@ -499,9 +539,16 @@ static void _SSE_MNNPackedMatMul_12_int8(float* C, const float* A, const float* z10 = MNNSSEFMA(s1, w3, z10); z11 = MNNSSEFMA(s2, w3, z11); } - TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); - TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); + if (0 == blockId) { + TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); + } else { + FMLA_TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + FMLA_TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + FMLA_TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); + } + } } @@ -512,7 +559,8 @@ static void _SSE_MNNPackedMatMul_8_int8(float* C, const float* A, const int8_t* auto cStride = parameter[3] / sizeof(float); float weightBytes = 1; // sizeof(int8_t) auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); 
float ws_tmp[4]; for (int y = 0; y < hC4; ++y) { @@ -555,8 +603,14 @@ static void _SSE_MNNPackedMatMul_8_int8(float* C, const float* A, const int8_t* z7 = MNNSSEFMA(s1, w2, z7); z10 = MNNSSEFMA(s1, w3, z10); } - TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + if (0 == blockId) { + TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + } else { + FMLA_TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + FMLA_TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + } + } } @@ -567,7 +621,8 @@ static void _SSE_MNNPackedMatMul_4_int8(float* C, const float* A, const int8_t* auto cStride = parameter[3] / sizeof(float); float weightBytes = 1; // sizeof(int8_t) auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); float ws_tmp[4]; for (int y = 0; y < hC4; ++y) { @@ -601,10 +656,27 @@ static void _SSE_MNNPackedMatMul_4_int8(float* C, const float* A, const int8_t* z9 = MNNSSEFMA(s0, w3, z9); } _MM_TRANSPOSE4_PS(z0, z3, z6, z9); - _mm_storeu_ps(dst + 4 * 0, z0); - _mm_storeu_ps(dst + 4 * 1, z3); - _mm_storeu_ps(dst + 4 * 2, z6); - _mm_storeu_ps(dst + 4 * 3, z9); + if (0 == blockId) { + _mm_storeu_ps(dst + 4 * 0, z0); + _mm_storeu_ps(dst + 4 * 1, z3); + _mm_storeu_ps(dst + 4 * 2, z6); + _mm_storeu_ps(dst + 4 * 3, z9); + } else { + auto t0 = _mm_loadu_ps(dst + 4 * 0); + auto t1 = _mm_loadu_ps(dst + 4 * 1); + auto t2 = _mm_loadu_ps(dst + 4 * 2); + auto t3 = _mm_loadu_ps(dst + 4 * 3); + + z0 = _mm_add_ps(t0, z0); + z3 = _mm_add_ps(t1, z3); + z6 = _mm_add_ps(t2, z6); + z9 = _mm_add_ps(t3, z9); + + _mm_storeu_ps(dst + 4 * 0, z0); + _mm_storeu_ps(dst + 4 * 1, z3); + _mm_storeu_ps(dst + 4 * 2, z6); + _mm_storeu_ps(dst + 4 * 3, z9); + } } } @@ -616,11 +688,12 @@ static void _SSE_MNNPackednMatMulRemainCommon_int8(float* C, const float* A, con auto cStride = parameter[3] / sizeof(float); float weightBytes = 1; // sizeof(int8_t) auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; auto hC4 = UP_DIV(h, 4); auto es = eSize; auto oC = C; auto aStride = parameter[0] / sizeof(float); + auto blockId = parameter[6]; if (eSize >= 8) { _SSE_MNNPackedMatMul_8_int8(C, A, B, parameter, k, b); eSize -= 8; @@ -646,7 +719,13 @@ static void _SSE_MNNPackednMatMulRemainCommon_int8(float* C, const float* A, con auto w = _load_int8x4(weight + sy * 4, alpha, bias); sum = MNNSSEFMA(s, w, sum); } - _mm_storeu_ps(dst, sum); + if (blockId == 0) { + _mm_storeu_ps(dst, sum); + } else { + auto t = _mm_loadu_ps(dst); + sum = _mm_add_ps(sum, t); + _mm_storeu_ps(dst, sum); + } } } } diff --git a/source/backend/cpu/x86_x64/sse/GemmSSE.cpp b/source/backend/cpu/x86_x64/sse/GemmSSE.cpp index accd3f8dd..7d5699e96 100644 --- a/source/backend/cpu/x86_x64/sse/GemmSSE.cpp +++ b/source/backend/cpu/x86_x64/sse/GemmSSE.cpp @@ -35,13 +35,17 @@ void _SSE_MNNPackedMatMul_int4(float* C, const float* A, const float* B, const s auto hC4 = UP_DIV(h, 4); auto cStride = parameter[3] / sizeof(float); _SSE_MNNPackedMatMul_12_int4(C, A, B, parameter, k, b); - _SSE_GemmPostTreat(C, 12, parameter, postParameters, bias); + if (nullptr != bias) { + _SSE_GemmPostTreat(C, 12, parameter, postParameters, bias); + } } void _SSE_MNNPackedMatMulRemain_int4(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b) { 
_SSE_MNNPackednMatMulRemainCommon_int4(C, A, B, eSize, parameter, postParameters, bias, k, b); - _SSE_GemmPostTreat(C, eSize, parameter, postParameters, bias); + if (nullptr != bias) { + _SSE_GemmPostTreat(C, eSize, parameter, postParameters, bias); + } } void _SSE_MNNPackedMatMul_int8(float* C, const float* A, const float* B, const size_t* parameter, @@ -50,13 +54,17 @@ void _SSE_MNNPackedMatMul_int8(float* C, const float* A, const float* B, const s auto hC4 = UP_DIV(h, 4); auto cStride = parameter[3] / sizeof(float); _SSE_MNNPackedMatMul_12_int8(C, A, B, parameter, k, b); - _SSE_GemmPostTreat(C, 12, parameter, postParameters, bias); + if (nullptr != bias) { + _SSE_GemmPostTreat(C, 12, parameter, postParameters, bias); + } } void _SSE_MNNPackedMatMulRemain_int8(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b) { _SSE_MNNPackednMatMulRemainCommon_int8(C, A, B, eSize, parameter, postParameters, bias, k, b); - _SSE_GemmPostTreat(C, eSize, parameter, postParameters, bias); + if (nullptr != bias) { + _SSE_GemmPostTreat(C, eSize, parameter, postParameters, bias); + } } void _SSE_MNNGemmHybridInt4(float* C, const int8_t* A, const int8_t* B, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, size_t realSize, const float** param) { diff --git a/source/backend/metal/AllShader.cpp b/source/backend/metal/AllShader.cpp index 620e6ce99..2fb7cacc2 100644 --- a/source/backend/metal/AllShader.cpp +++ b/source/backend/metal/AllShader.cpp @@ -1659,6 +1659,7 @@ const char* shader_MetalConvolution1x1_metal = " int output_slice;\n" " int output_channel;\n" " int batch;\n" +" int block_size;\n" " conv_activation_type activation;\n" "};\n" "kernel void conv1x1_g1z4(const device M4 *in [[buffer(0)]],\n" @@ -1712,29 +1713,32 @@ const char* shader_MetalConvolution1x1_metal = " auto biasValue=FLOAT4(biasTerms[uz]);\n" " FLOAT4 result0=biasValue,result1=biasValue,result2=biasValue,result3=biasValue;\n" " int computeSize=min(cst.output_size-rx,CONV_UNROLL);\n" -" auto scale=FLOAT4(dequantScale[uz]);\n" -" auto dequant_bias=FLOAT4(dequantScale[uz+cst.output_slice]);\n" -" for (auto z=0; zfloat */\n" " FLOAT4x4 w_fp32=FLOAT4x4(FLOAT4(w[0]),FLOAT4(w[1]),FLOAT4(w[2]),FLOAT4(w[3]));\n" " FLOAT4x4 w_dequant;\n" " for (int i=0; i<4; ++i) {\n" " w_dequant[i]=w_fp32[i]*scale[i]+dequant_bias[i];\n" " }\n" -" \n" " result0 += FLOAT4(in40*w_dequant);\n" " result1 += FLOAT4(in41*w_dequant);\n" " result2 += FLOAT4(in42*w_dequant);\n" " result3 += FLOAT4(in43*w_dequant);\n" " xy_in0 += cst.input_size*cst.batch;\n" " }\n" -" \n" +" }\n" " /* true */ \n" " xy_out[0]=activate(M4(result0),cst.activation);\n" " if (computeSize>1) {xy_out[1]=activate(M4(result1),cst.activation); }\n" @@ -1757,9 +1761,13 @@ const char* shader_MetalConvolution1x1_metal = " auto biasValue=FLOAT4(biasTerms[uz]);\n" " FLOAT4 result0=biasValue,result1=biasValue,result2=biasValue,result3=biasValue;\n" " int computeSize=min(cst.output_size-rx,CONV_UNROLL);\n" -" auto scale=FLOAT4(dequantScale[uz]);\n" -" auto dequant_bias=FLOAT4(dequantScale[uz+cst.output_slice]);\n" -" for (auto z=0; z) diff --git a/source/backend/metal/MetalAttention.mm b/source/backend/metal/MetalAttention.mm index 5bdba5fff..2c6eed591 100644 --- a/source/backend/metal/MetalAttention.mm +++ b/source/backend/metal/MetalAttention.mm @@ -24,6 +24,7 @@ int query_seq_len; int key_seq_len; int head_num; + int group; int head_dim; float scale; }; @@ -45,18 +46,21 @@ kernel void 
main0(const device T* input0 [[buffer(0)]], if (x >= param.query_seq_len || y >= param.head_num || z >= param.key_seq_len) { return; } + int group = param.group; int query_seq_len = param.query_seq_len; int key_seq_len = param.key_seq_len; int head_num = param.head_num; int head_dim = param.head_dim; + int yr = y % param.group; const int offset = head_num * head_dim; const int offset_head = y * head_dim; + const int offset_head_kv = (y / param.group) * head_dim; const device T* A_offset = input0 + x * offset + offset_head; - device T* Pastkey_offset = past_key + z * offset + offset_head; + device T* Pastkey_offset = past_key + z * offset / group + offset_head_kv; float Vscale = (float)param.scale; #ifdef FOR_PREFILL - device const T* B_offset = input1 + z * offset + offset_head; + device const T* B_offset = input1 + z * offset / group + offset_head_kv; const int output_offset = y * query_seq_len * key_seq_len; float out0 = 0.0; @@ -64,7 +68,9 @@ kernel void main0(const device T* input0 [[buffer(0)]], float A = (float)(A_offset[i]); float B = (float)(B_offset[i]); out0 += B * A; - Pastkey_offset[i] = (T)B; + if (yr == 0) { + Pastkey_offset[i] = (T)B; + } } out0 *= Vscale; @@ -76,14 +82,16 @@ kernel void main0(const device T* input0 [[buffer(0)]], #endif output[output_offset + x * key_seq_len + z] = (T)out0; #else - const device T *B_offset = input1 + offset_head; + const device T *B_offset = input1 + offset_head_kv; float out = 0.0; if (z == key_seq_len - 1) { for(int i = 0; i < head_dim; ++i){ float A = (float)(A_offset[i]); float B = (float)(B_offset[i]); out += B * A; - Pastkey_offset[i] = (T)B; + if (yr == 0) { + Pastkey_offset[i] = (T)B; + } } } else { for(int i = 0; i < head_dim; ++i){ @@ -109,6 +117,7 @@ kernel void main0(const device T* input0 [[buffer(0)]], int query_seq_len; int key_seq_len; int head_num; + int group; int head_dim; float scale; }; @@ -124,12 +133,15 @@ kernel void main0(const device T* input0 [[buffer(0)]], if (x >= param.query_seq_len || y >= param.head_num || z >= param.head_dim) { return; } + int group = param.group; + int yin = y / param.group; + int yr = y % param.group; int qk_seq_len = param.query_seq_len; int value_seq_len = param.key_seq_len; int head_num = param.head_num; int head_dim = param.head_dim; - const int offset = head_num * head_dim; - const int offset_head = y * head_dim + z; + const int stride = head_num * head_dim / group; + const int offset_head = yin * head_dim + z; #ifdef FOR_PREFILL device const T *A_offset = input0 + (y * qk_seq_len + x) * value_seq_len; device const T *B_offset = input1 + offset_head; @@ -138,11 +150,13 @@ kernel void main0(const device T* input0 [[buffer(0)]], for(int i = 0; i < value_seq_len; ++i){ float A0 = (float)A_offset[i]; - float B = (float)B_offset[i*offset]; + float B = (float)B_offset[i*stride]; out += A0 * B; - Pastvalue_offset[i*offset] = B; + if (yr == 0) { + Pastvalue_offset[i*stride] = B; + } } - output[ x * offset + (y * head_dim + z)] = out; + output[ x * stride * group + (y * head_dim + z)] = out; #else device const T *A_offset = input0 + y; device const T *B_offset = input1 + offset_head; @@ -151,12 +165,14 @@ kernel void main0(const device T* input0 [[buffer(0)]], for(int i = 0; i < value_seq_len - 1; ++i){ float A = (float)A_offset[i * head_num]; - float B = (float)Pastvalue_offset[i * offset]; + float B = (float)Pastvalue_offset[i * stride]; out += A * B; } out += (float)A_offset[(value_seq_len - 1)*head_num] * (float)B_offset[0]; - Pastvalue_offset[(value_seq_len - 1)*offset] = B_offset[0]; + 
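    // Context for the grouped-query-attention (GQA) handling in this kernel: with
    // group = head_num / kv_head_num (passed in as param.group), query head y reads
    // the shared KV head y / group, yr = y % group is its index inside the group,
    // and only the first query head of each group (yr == 0) writes the shared past
    // key/value cache, so each KV head is stored exactly once. Illustrative mapping
    // only (kv_head_num is not a variable of this shader):
    //     int group   = head_num / kv_head_num;  // e.g. 32 query heads / 8 KV heads = 4
    //     int kv_head = y / group;                // KV head shared by query head y
    //     int yr      = y % group;                // position of y within its group
    //     if (yr == 0) { /* this thread updates the shared KV cache entry */ }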
if (yr == 0) { + Pastvalue_offset[(value_seq_len - 1)*stride] = B_offset[0]; + } output[(y * head_dim + z)] = (T)out; #endif @@ -194,7 +210,7 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override { const int mExpandChunk = 64; bool mIsDecode = false; std::shared_ptr mTempQK, mTempSoftMax; - int mNumHead = 0, mHeadDim = 0, mValueH = 0; + int mNumHead = 0, mHeadDim = 0, mValueH = 0, mKvNumHead = 0; id mKernel_softmax = nil; id mKernel_qk = nil; @@ -209,6 +225,7 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override { int query_seq_len; int key_seq_len; int head_num; + int group; int head_dim; float scale; }; @@ -247,13 +264,13 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override { } bool needCopy = mCache->mMaxLength > 0; - size_t old_size = mNumHead * mCache->mMaxLength * mHeadDim * byte; + size_t old_size = mKvNumHead * mCache->mMaxLength * mHeadDim * byte; mCache->mMaxLength = mCache->mPastLength + mExpandChunk; // past_key: [1, numhead, headdim, maxlen] - auto new_key = Tensor::createDevice({mCache->mMaxLength, mNumHead, mHeadDim}); + auto new_key = Tensor::createDevice({mCache->mMaxLength, mKvNumHead, mHeadDim}); // past_value: [1, numhead, maxlen, headdim] - auto new_value = Tensor::createDevice({mCache->mMaxLength, mNumHead, mHeadDim}); - size_t size = mNumHead * mCache->mMaxLength * mHeadDim * byte; + auto new_value = Tensor::createDevice({mCache->mMaxLength, mKvNumHead, mHeadDim}); + size_t size = mKvNumHead * mCache->mMaxLength * mHeadDim * byte; backend()->onAcquireBuffer(new_key, Backend::STATIC); backend()->onAcquireBuffer(new_value, Backend::STATIC); if (needCopy) { @@ -356,6 +373,10 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override { if(mIsDecode){ mCache->mKv_seq_len = mCache->mPastLength + 1; } + mKvNumHead = key->shape()[2]; + + int group_size = mNumHead / mKvNumHead; + reallocKVCache(); // Update Parameters @@ -365,6 +386,7 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override { param->head_dim = mHeadDim; param->key_seq_len = mCache->mKv_seq_len; param->head_num = mNumHead; + param->group = group_size; param->query_seq_len = seq_len; } // For softmax parameter diff --git a/source/backend/metal/MetalConvolution1x1.mm b/source/backend/metal/MetalConvolution1x1.mm index 9ca59c59c..cfd9f47b6 100644 --- a/source/backend/metal/MetalConvolution1x1.mm +++ b/source/backend/metal/MetalConvolution1x1.mm @@ -70,9 +70,12 @@ auto oc_4 = UP_DIV(output->channel(), 4); auto backend = static_cast(this->backend()); auto context = (__bridge MNNMetalContext *)backend->context(); - + int blockSize = 1; + if (mDequantScaleBias.get()) { + blockSize = (int)(mDequantScaleBias->usize() /sizeof(float) / oc_4 / 2 / 4); + } // create const buffer - int constants[] = {is, ic_4, ow, oh, os, oc_4, oc, ob, mActivationType}; + int constants[] = {is, ic_4, ow, oh, os, oc_4, oc, ob, blockSize, mActivationType}; mConstBuffer = backend->getConstBuffer(sizeof(constants)); ::memcpy(mConstBuffer.contents, constants, sizeof(constants)); diff --git a/source/backend/metal/MetalConvolutionCommon.mm b/source/backend/metal/MetalConvolutionCommon.mm index 5bdeea2b0..548aae2ef 100644 --- a/source/backend/metal/MetalConvolutionCommon.mm +++ b/source/backend/metal/MetalConvolutionCommon.mm @@ -97,35 +97,52 @@ void weightInBlock(int group, int oc, int ic, int kh, int kw, const FType *src, } } -static std::shared_ptr getDequantScale(float* scale, int size, MetalBackend *backend, bool asymmetric) { - int 
outputCount = 0; +static std::shared_ptr getDequantScale(const float* scale, int size, MetalBackend *backend, bool asymmetric, int oc) { + int totalCount = 0; if (asymmetric) { - outputCount = size / 2; + totalCount = size / 2; } else { - outputCount = size; + totalCount = size; } - int alignOutputCount = ALIGN_UP4(outputCount); - std::shared_ptr dequantScale(MNN::Tensor::createDevice({(int)(alignOutputCount * sizeof(float) * 2)})); + int blockSize = totalCount / oc; + int alignOutputCount = ALIGN_UP4(oc); + std::shared_ptr dequantScale(MNN::Tensor::createDevice({alignOutputCount, blockSize, (int)(sizeof(float) * 2)})); bool res = backend->onAcquireBuffer(dequantScale.get(), Backend::STATIC); if (!res) { MNN_ERROR("Buffer allocated error!\n"); return nullptr; } auto buffer0 = MetalBackend::getBuffer(dequantScale.get()); - auto dst_scale = (uint8_t*)[buffer0.first contents] + buffer0.second; - ::memset(dst_scale, 0, alignOutputCount * 2 * sizeof(float)); - auto dst_bias = dst_scale + alignOutputCount * sizeof(float); - for (int o = 0; o < outputCount; ++o) { - float min = 0.0f; - float alpha = 0.0f; - if (asymmetric) { - min = scale[2*o]; - alpha = scale[2*o+1]; - } else { - alpha = scale[o]; + auto dst_scale = (float*)((uint8_t*)[buffer0.first contents] + buffer0.second); + ::memset(dst_scale, 0, dequantScale->usize()); + if (asymmetric) { + for (int z=0; zweight.get() != nullptr) { auto backend = static_cast(this->backend()); mWeight = weightTransform(group, oc, ic, kh, kw, (float*)qnt->weight.get(), !qnt->canUseInt4, qnt->canUseInt4); - auto dequantParams = getDequantScale(qnt->alpha.get(), qnt->alpha.size(), backend, qnt->asymmetric); + auto dequantParams = getDequantScale(qnt->alpha.get(), qnt->alpha.size(), backend, qnt->asymmetric, oc); mDequantScaleBias = dequantParams; mDequantBits = qnt->canUseInt4 ? 
4:8; } else if (qnt && qnt->weightFloat.size() > 0) { diff --git a/source/backend/metal/shader/MetalConvolution1x1.metal b/source/backend/metal/shader/MetalConvolution1x1.metal index 07b177dd9..21bd0d8d0 100644 --- a/source/backend/metal/shader/MetalConvolution1x1.metal +++ b/source/backend/metal/shader/MetalConvolution1x1.metal @@ -10,6 +10,7 @@ struct conv1x1_constants { int output_slice; int output_channel; int batch; + int block_size; conv_activation_type activation; }; @@ -67,30 +68,32 @@ kernel void conv1x1_g1z4_w8(const device ftype4 *in [[buffer(0)]], auto biasValue = FLOAT4(biasTerms[uz]); FLOAT4 result0 = biasValue, result1 = biasValue, result2 = biasValue, result3 = biasValue; int computeSize = min(cst.output_size - rx, CONV_UNROLL); - auto scale = FLOAT4(dequantScale[uz]); - auto dequant_bias = FLOAT4(dequantScale[uz + cst.output_slice]); - - for (auto z = 0; z < cst.input_slice; z++) { - auto in40 = (FLOAT4)*xy_in0; - auto in41 = (FLOAT4)*(xy_in0 + 1); - auto in42 = (FLOAT4)*(xy_in0 + 2); - auto in43 = (FLOAT4)*(xy_in0 + 3); - auto w = xy_wt[z]; - - /* weight int8->float */ - FLOAT4x4 w_fp32 = FLOAT4x4(FLOAT4(w[0]), FLOAT4(w[1]), FLOAT4(w[2]), FLOAT4(w[3])); - FLOAT4x4 w_dequant; - for (int i = 0; i < 4; ++i) { - w_dequant[i] = w_fp32[i] * scale[i] + dequant_bias[i]; + int block = (cst.input_slice + cst.block_size - 1) / cst.block_size; + for (int bi=0; bi 1) {xy_out[1] = activate(ftype4(result1), cst.activation); } @@ -115,31 +118,35 @@ kernel void conv1x1_g1z4_w4(const device ftype4 *in [[buffer(0)]], auto biasValue = FLOAT4(biasTerms[uz]); FLOAT4 result0 = biasValue, result1 = biasValue, result2 = biasValue, result3 = biasValue; int computeSize = min(cst.output_size - rx, CONV_UNROLL); - auto scale = FLOAT4(dequantScale[uz]); - auto dequant_bias = FLOAT4(dequantScale[uz + cst.output_slice]); - - for (auto z = 0; z < cst.input_slice; z++) { - auto in40 = (FLOAT4)*xy_in0; - auto in41 = (FLOAT4)*(xy_in0 + 1); - auto in42 = (FLOAT4)*(xy_in0 + 2); - auto in43 = (FLOAT4)*(xy_in0 + 3); - MNN::uchar4x2 w_int4 = xy_wt[z]; - // MNN::char4x4 w_int8(char4(0)); - /* weight int4->float */ - //FLOAT4x4 w_fp32 = FLOAT4x4(FLOAT4(w[0]), FLOAT4(w[1]), FLOAT4(w[2]), FLOAT4(w[3])); - FLOAT4x4 w_dequant; - for (int i = 0; i < 4; ++i) { - // ftype4 w4 = ftype4(w_fp32[i]); - FLOAT4 w4 = FLOAT4((float)(w_int4[i][0] >> 4) - 8, (float)(w_int4[i][0] & 15) - 8, (float)(w_int4[i][1] >> 4) - 8, (float)(w_int4[i][1] & 15) - 8); - FLOAT4 res = w4 * scale[i] + dequant_bias[i]; - w_dequant[i] = res; + int block = (cst.input_slice + cst.block_size - 1) / cst.block_size; + for (int bi=0; bifloat */ + //FLOAT4x4 w_fp32 = FLOAT4x4(FLOAT4(w[0]), FLOAT4(w[1]), FLOAT4(w[2]), FLOAT4(w[3])); + FLOAT4x4 w_dequant; + for (int i = 0; i < 4; ++i) { + // ftype4 w4 = ftype4(w_fp32[i]); + FLOAT4 w4 = FLOAT4((float)(w_int4[i][0] >> 4) - 8, (float)(w_int4[i][0] & 15) - 8, (float)(w_int4[i][1] >> 4) - 8, (float)(w_int4[i][1] & 15) - 8); + FLOAT4 res = w4 * scale[i] + dequant_bias[i]; + w_dequant[i] = res; + } + + result0 += FLOAT4(in40 * w_dequant); + result1 += FLOAT4(in41 * w_dequant); + result2 += FLOAT4(in42 * w_dequant); + result3 += FLOAT4(in43 * w_dequant); + xy_in0 += cst.input_size * cst.batch; } - - result0 += FLOAT4(in40 * w_dequant); - result1 += FLOAT4(in41 * w_dequant); - result2 += FLOAT4(in42 * w_dequant); - result3 += FLOAT4(in43 * w_dequant); - xy_in0 += cst.input_size * cst.batch; } /* true */ diff --git a/source/backend/opencl/execution/cl/opencl_program.cc 
b/source/backend/opencl/execution/cl/opencl_program.cc
index 36938ef60..7a460a3a0 100644
--- a/source/backend/opencl/execution/cl/opencl_program.cc
+++ b/source/backend/opencl/execution/cl/opencl_program.cc
@@ -207,7 +207,7 @@ extern const std::map> OpenCLProgramMap
 #ifndef MNN_OPENCL_BUFFER_CLOSED
     {
         "attention_buf",
-        { 0x23,0x69,0x66,0x64,0x65,0x66, /* auto-generated hex encoding of the attention_buf OpenCL kernel source; the machine-generated byte array is not human-readable and the remainder of this hunk is omitted here */ },
x69,0x6e,0x74,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x7b,0xa,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x78,0x20,0x3d,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x3b,0x20,0x2f,0x2f,0x20,0x70,0x72,0x65,0x66,0x69,0x6c,0x6c,0x20,0x71,0x6b,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2f,0x20,0x34,0x20,0x20,0x20,0x64,0x65,0x63,0x6f,0x64,0x65,0x20,0x31,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x79,0x20,0x3d,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x3b,0x20,0x2f,0x2f,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x7a,0x20,0x3d,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x32,0x29,0x3b,0x20,0x2f,0x2f,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x3c,0x3c,0x20,0x32,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x7a,0x34,0x20,0x3d,0x20,0x7a,0x20,0x3c,0x3c,0x20,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x44,0x45,0x41,0x4c,0x5f,0x4e,0x4f,0x4e,0x5f,0x55,0x4e,0x49,0x46,0x4f,0x52,0x4d,0x5f,0x44,0x49,0x4d,0x33,0x28,0x78,0x2c,0x20,0x79,0x2c,0x20,0x7a,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x4f,0x50,0x45,0x4e,0x43,0x4c,0x5f,0x50,0x52,0x45,0x46,0x49,0x4c,0x4c,0x5f,0x41,0x54,0x54,0x45,0x4e,0x54,0x49,0x4f,0x4e,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x20,0x3d,0x20,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2a,0x20,0x34,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x3d,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x33,0x29,0x20,0x2f,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x71,0x6b,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x3d,0x20,0x28,0x71,0x6b,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x33,0x29,0x20,0x2f,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x20,0x2b,0x20,0x28,0x79,0x20,0x2a,0x20,0x71,0x6b,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2b,0x20,0x78,0x29,0x20,0x2a,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x20,0x2b,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x3b,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x
67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x70,0x61,0x73,0x74,0x5f,0x76,0x61,0x6c,0x75,0x65,0x20,0x2b,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x30,0x3b,0x20,0x69,0x20,0x3c,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x20,0x69,0x6e,0x64,0x65,0x78,0x20,0x3d,0x20,0x69,0x20,0x3c,0x3c,0x20,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x30,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x6e,0x64,0x65,0x78,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x31,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x31,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x32,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x32,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x33,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x33,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x42,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x30,0x2c,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x69,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0
x2e,0x73,0x30,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x31,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x31,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x32,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x32,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x33,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x34,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x31,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x35,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x32,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x36,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x37,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x38,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x31,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x39,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x32,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x61,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x62,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x63,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x31,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x64,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0x
a,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x32,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x65,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x66,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x31,0x36,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x42,0x29,0x2c,0x20,0x30,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x69,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0xa,0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x48,0x45,0x41,0x44,0x44,0x49,0x4d,0x5f,0x4c,0x45,0x41,0x56,0x45,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x42,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x30,0x2c,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x42,0x5f,0x70,0x74,0x72,0x20,0x3d,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x29,0x26,0x42,0x3b,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x34,0x2c,0x20,0x6a,0x20,0x3d,0x20,0x30,0x3b,0x20,0x69,0x20,0x3c,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x3b,0x20,0x2b,0x2b,0x69,0x2c,0x20,0x2b,0x2b,0x6a,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x30,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x20,0x2b,0x20,0x34,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x20,0x2b,0x20,0x38,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x
45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x20,0x2b,0x20,0x31,0x32,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x6f,0x75,0x74,0x30,0x29,0x2c,0x20,0x30,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x29,0x20,0x2a,0x20,0x34,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x42,0x2e,0x73,0x30,0x31,0x32,0x33,0x29,0x2c,0x20,0x30,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x31,0x20,0x3e,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x6f,0x75,0x74,0x31,0x29,0x2c,0x20,0x30,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x34,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x42,0x2e,0x73,0x34,0x35,0x36,0x37,0x29,0x2c,0x20,0x31,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x32,0x20,0x3e,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x6f,0x75,0x74,0x32,0x29,0x2c,0x20,0x30,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x34,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x42,0x2e,0x73,0x38,0x39,0x61,0x62,0x29,0x2c,0x20,0x32,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x33,0x20,0x3e,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x6f,0x75,0x74,0x33,0x29,0x2c,0x20,0x30,0x2
c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x34,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x42,0x2e,0x73,0x63,0x64,0x65,0x66,0x29,0x2c,0x20,0x33,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x23,0x65,0x6c,0x73,0x65,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x42,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x30,0x2c,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x31,0x36,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x42,0x29,0x2c,0x20,0x30,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x42,0x5f,0x70,0x74,0x72,0x20,0x3d,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x29,0x26,0x42,0x3b,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x34,0x2c,0x20,0x6a,0x20,0x3d,0x20,0x30,0x3b,0x20,0x69,0x20,0x3c,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x3b,0x20,0x2b,0x2b,0x69,0x2c,0x20,0x2b,0x2b,0x6a,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x30,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x20,0x2b,0x20,0x34,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x20,0x2b,0x20,0x38,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x32,0x2
9,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x20,0x2b,0x20,0x31,0x32,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x31,0x36,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x29,0x28,0x6f,0x75,0x74,0x30,0x2c,0x20,0x6f,0x75,0x74,0x31,0x2c,0x20,0x6f,0x75,0x74,0x32,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x29,0x2c,0x20,0x30,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x29,0x20,0x2a,0x20,0x34,0x29,0x3b,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0xa,0x23,0x65,0x6c,0x73,0x65,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x3d,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x33,0x29,0x20,0x2f,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x20,0x3d,0x20,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2a,0x20,0x34,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x6c,0x6f,0x6f,0x70,0x20,0x3d,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x32,0x29,0x20,0x2f,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x20,0x2b,0x20,0x79,0x20,0x2a,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x20,0x2b,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x3b,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x70,0x61,0x73,0x74,0x5f,0x76,0x61,0x6c,0x75,0x65,0x20,0x2b,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x30,0x3b,0x20,0x69,0x20,0x3c,0x20,0x6c,0x6f,0x6f,0x70,0x20,0x2d,0x20,0x31,0x3b,0x20,0x69,0x2b,0x2b,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,
0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x42,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x30,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x69,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x30,0x20,0x2b,0x3d,0x20,0x64,0x6f,0x74,0x28,0x41,0x2c,0x20,0x42,0x2e,0x73,0x30,0x31,0x32,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x31,0x20,0x2b,0x3d,0x20,0x64,0x6f,0x74,0x28,0x41,0x2c,0x20,0x42,0x2e,0x73,0x34,0x35,0x36,0x37,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x32,0x20,0x2b,0x3d,0x20,0x64,0x6f,0x74,0x28,0x41,0x2c,0x20,0x42,0x2e,0x73,0x38,0x39,0x61,0x62,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x33,0x20,0x2b,0x3d,0x20,0x64,0x6f,0x74,0x28,0x41,0x2c,0x20,0x42,0x2e,0x73,0x63,0x64,0x65,0x66,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x20,0x73,0x74,0x61,0x72,0x74,0x20,0x3d,0x20,0x28,0x6c,0x6f,0x6f,0x70,0x20,0x2d,0x20,0x31,0x29,0x20,0x3c,0x20,0x30,0x20,0x3f,0x20,0x30,0x20,0x3a,0x20,0x28,0x6c,0x6f,0x6f,0x70,0x20,0x2d,0x20,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x42,0x5f,0x56,0x65,0x63,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x30,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x73,0x74,0x61,0x72,0x74,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x42,0x5f,0x70,0x74,0x72,0x20,0x3d,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x29,0x26,0x42,0x5f,0x56,0x65,0x63,0x3b,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x73,0x74,0x61,0x72,0x74,0x20,0x2a,0x20,0x34,0x3b,0x20,0x69,0x20,0x3c,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2d,0x20,0x31,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x41,0x20,0x3d,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x20,0x69,0x6e,0x64,0x65,0x78,0x20,0x3d,0x20,0x69,0x20,0x25,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x30,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x5f,0x70,0x74,0x72,0x5b,0x69,0x6e,0x64,0x65,0x78,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x31,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x5f,0x70,0x74,0x72,0x5b,0x69,0x6e,0x64,0x65,0x78,0x2b,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x32,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x
20,0x42,0x5f,0x70,0x74,0x72,0x5b,0x69,0x6e,0x64,0x65,0x78,0x2b,0x38,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x33,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x5f,0x70,0x74,0x72,0x5b,0x69,0x6e,0x64,0x65,0x78,0x2b,0x31,0x32,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x41,0x20,0x3d,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2d,0x20,0x31,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x42,0x30,0x20,0x3d,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x30,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x42,0x31,0x20,0x3d,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x42,0x32,0x20,0x3d,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x38,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x42,0x33,0x20,0x3d,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x31,0x32,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x30,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x31,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x32,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x33,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x33,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x20,0x69,0x6e,0x64,0x65,0x78,0x20,0x3d,0x20,0x28,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2d,0x20,0x31,0x29,0x20,0x3e,0x3e,0x20,0x32,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2d,0x20,0x31,0x29,0x20,0x25,0x20,0x34,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x48,0x45,0x41,0x44,0x44,0x49,0x4d,0x5f,0x4c,0x45,0x41,0x56,0x45,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x5d,0x20,0x3d,0x20,0x42,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x31,0x20,0x3e,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x34,0x5d,0x20,0x3d,0x20,0x42,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x32,0x20,0x3e,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x6
5,0x78,0x20,0x2b,0x20,0x38,0x5d,0x20,0x3d,0x20,0x42,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x33,0x20,0x3e,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x31,0x32,0x5d,0x20,0x3d,0x20,0x42,0x33,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x33,0x3b,0xa,0x23,0x65,0x6c,0x73,0x65,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x5d,0x20,0x3d,0x20,0x42,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x34,0x5d,0x20,0x3d,0x20,0x42,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x38,0x5d,0x20,0x3d,0x20,0x42,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x31,0x32,0x5d,0x20,0x3d,0x20,0x42,0x33,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x33,0x3b,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0x20,0x20,0x20,0x20,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0x7d,0xa,0xa, } + { 
0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x4d,0x4e,0x4e,0x5f,0x53,0x55,0x50,0x50,0x4f,0x52,0x54,0x5f,0x46,0x50,0x31,0x36,0xa,0x23,0x70,0x72,0x61,0x67,0x6d,0x61,0x20,0x4f,0x50,0x45,0x4e,0x43,0x4c,0x20,0x45,0x58,0x54,0x45,0x4e,0x53,0x49,0x4f,0x4e,0x20,0x63,0x6c,0x5f,0x6b,0x68,0x72,0x5f,0x66,0x70,0x31,0x36,0x20,0x3a,0x20,0x65,0x6e,0x61,0x62,0x6c,0x65,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0xa,0x23,0x64,0x65,0x66,0x69,0x6e,0x65,0x20,0x47,0x4c,0x4f,0x42,0x41,0x4c,0x5f,0x53,0x49,0x5a,0x45,0x5f,0x33,0x5f,0x44,0x49,0x4d,0x53,0x20,0x5c,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x73,0x69,0x7a,0x65,0x5f,0x64,0x69,0x6d,0x30,0x2c,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x73,0x69,0x7a,0x65,0x5f,0x64,0x69,0x6d,0x31,0x2c,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x73,0x69,0x7a,0x65,0x5f,0x64,0x69,0x6d,0x32,0x2c,0xa,0xa,0x23,0x64,0x65,0x66,0x69,0x6e,0x65,0x20,0x44,0x45,0x41,0x4c,0x5f,0x4e,0x4f,0x4e,0x5f,0x55,0x4e,0x49,0x46,0x4f,0x52,0x4d,0x5f,0x44,0x49,0x4d,0x33,0x28,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x69,0x6e,0x70,0x75,0x74,0x32,0x2c,0x20,0x69,0x6e,0x70,0x75,0x74,0x33,0x29,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5c,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x69,0x6e,0x70,0x75,0x74,0x31,0x20,0x3e,0x3d,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x73,0x69,0x7a,0x65,0x5f,0x64,0x69,0x6d,0x30,0x20,0x7c,0x7c,0x20,0x69,0x6e,0x70,0x75,0x74,0x32,0x20,0x3e,0x3d,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x73,0x69,0x7a,0x65,0x5f,0x64,0x69,0x6d,0x31,0x20,0x7c,0x7c,0x20,0x69,0x6e,0x70,0x75,0x74,0x33,0x20,0x3e,0x3d,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x73,0x69,0x7a,0x65,0x5f,0x64,0x69,0x6d,0x32,0x29,0x20,0x7b,0x20,0x5c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5c,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x6d,0x61,0x74,0x6d,0x75,0x6c,0x5f,0x71,0x6b,0x5f,0x64,0x69,0x76,0x5f,0x6d,0x61,0x73,0x6b,0x28,0x47,0x4c,0x4f,0x42,0x41,0x4c,0x5f,0x53,0x49,0x5a,0x45,0x5f,0x33,0x5f,0x44,0x49,0x4d,0x53,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x2f,0x2f,0x20,0x71,0x75,0x65,0x72,0x79,0x20,0x5b,0x31,0x20,0x71,0x75,0x65,0x72,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x2f,0x34,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x34,0x5d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x2f,0x2f,0x20,0x6b,0x65,0x79,0x20,0x5b,0x31,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x2f,0x34,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x34,0x5d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x20,0x2f,0x2f,0x20,0x70,0x72,0x65,0x66,0x69,0x6c,0x6c,0x20,0x5b,0x31,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x20,0x71,0x75,0x65,0x72,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x2f,0x34,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x34,0x5d,0x20,0x20,0x20,0x64,0x65,0x63,0x6f,0x64,0x65,0x5b,0x31,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x2f,0x34,0x20,0x34,0x5d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x70,0x61,0x73,0x74,0x5f,0x6b,0x65,0x79,0x2c,0x20,0x2f,0x2f,0x20,0x5b,0x31,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x2f,0x34,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x34,0x5d,0xa,0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x41,0x44,0x44,0x5f,0x4d,0x41,0x53,0x4b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x20,0x6d,0x61,0x73,0x6b,0x2c,0xa,0x23,0x65,0x6c,0x73,0x65,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x2a,0x20,0x6d,0x61,0x73,0x6b,0x2c,0x20,0x2f,0x2f,0x20,0x5b,0x31,0x20,0x31,0x20,0x71,0x75,0x65,0x72,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x34,0x5d,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x66,0x6c,0x6f,0x61,0x74,0x20,0x73,0x63,0x61,0x6c,0x65,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x71,0x75,0x65,0x72,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0
x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x7b,0xa,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x78,0x20,0x3d,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x3b,0x20,0x2f,0x2f,0x20,0x71,0x75,0x65,0x72,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2f,0x20,0x34,0x20,0x66,0x6f,0x72,0x20,0x70,0x72,0x65,0x66,0x69,0x6c,0x6c,0x20,0x20,0x20,0x31,0x20,0x66,0x6f,0x72,0x20,0x64,0x65,0x63,0x6f,0x64,0x65,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x79,0x20,0x3d,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x3b,0x20,0x2f,0x2f,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x7a,0x20,0x3d,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x32,0x29,0x3b,0x20,0x2f,0x2f,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2f,0x20,0x34,0xa,0x20,0x20,0x20,0x20,0x44,0x45,0x41,0x4c,0x5f,0x4e,0x4f,0x4e,0x5f,0x55,0x4e,0x49,0x46,0x4f,0x52,0x4d,0x5f,0x44,0x49,0x4d,0x33,0x28,0x78,0x2c,0x20,0x79,0x2c,0x20,0x7a,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x20,0x3d,0x20,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x3b,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x50,0x61,0x73,0x74,0x6b,0x65,0x79,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x70,0x61,0x73,0x74,0x5f,0x6b,0x65,0x79,0x20,0x2b,0x20,0x7a,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x7a,0x34,0x20,0x3d,0x20,0x7a,0x20,0x3c,0x3c,0x20,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x56,0x73,0x63,0x61,0x6c,0x65,0x20,0x3d,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x73,0x63,0x61,0x6c,0x65,0x3b,0xa,0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x4f,0x50,0x45,0x4e,0x43,0x4c,0x5f,0x50,0x52,0x45,0x46,0x49,0x4c,0x4c,0x5f,0x41,0x54,0x54,0x45,0x4e,0x54,0x49,0x4f,0x4e,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x42,0x5f,0x6f,0x66,0x66,0
x73,0x65,0x74,0x20,0x3d,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x20,0x2b,0x20,0x7a,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x78,0x34,0x20,0x3d,0x20,0x78,0x20,0x3c,0x3c,0x20,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x71,0x75,0x65,0x72,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x3d,0x20,0x28,0x71,0x75,0x65,0x72,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x33,0x29,0x20,0x2f,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x79,0x20,0x2a,0x20,0x71,0x75,0x65,0x72,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x34,0x20,0x3d,0x20,0x28,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x33,0x29,0x20,0x2f,0x20,0x34,0x3b,0xa,0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x48,0x45,0x41,0x44,0x44,0x49,0x4d,0x5f,0x4c,0x45,0x41,0x56,0x45,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x30,0x3b,0x20,0x69,0x20,0x3c,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x34,0x20,0x2d,0x20,0x31,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x41,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x69,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x42,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x69,0x2c,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x30,0x31,0x32,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x30,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x30,0x31,0x32,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x31,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x6
4,0x28,0x41,0x2e,0x73,0x30,0x31,0x32,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x32,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x30,0x31,0x32,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x33,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x34,0x35,0x36,0x37,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x34,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x34,0x35,0x36,0x37,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x35,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x34,0x35,0x36,0x37,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x36,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x34,0x35,0x36,0x37,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x37,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x38,0x39,0x61,0x62,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x38,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x38,0x39,0x61,0x62,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x39,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x38,0x39,0x61,0x62,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x61,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x38,0x39,0x61,0x62,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x62,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x63,0x64,0x65,0x66,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x63,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x63,0x64,0x65,0x66,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x64,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2
e,0x73,0x63,0x64,0x65,0x66,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x65,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x63,0x64,0x65,0x66,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x66,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x31,0x36,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x42,0x29,0x2c,0x20,0x69,0x2c,0x20,0x50,0x61,0x73,0x74,0x6b,0x65,0x79,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x28,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x34,0x3b,0x20,0x69,0x20,0x3c,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x42,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x2c,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x30,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x31,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x32,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x33,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x42,0x29,0x2c,0x20,0x69,0x2c,0x20,0x50,0x61,0x73,0x74,0x6b,0x65,0x79,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x23,0x65,0x6c,0x73,0x65,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x30,0x3b,0x20,0x69,0x20,0x3c,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x34,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0
x41,0x54,0x31,0x36,0x20,0x41,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x69,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x42,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x69,0x2c,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x30,0x31,0x32,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x30,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x30,0x31,0x32,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x31,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x30,0x31,0x32,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x32,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x30,0x31,0x32,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x33,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x34,0x35,0x36,0x37,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x34,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x34,0x35,0x36,0x37,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x35,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x34,0x35,0x36,0x37,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x36,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x34,0x35,0x36,0x37,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x37,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x38,0x39,0x61,0x62,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x38,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x38,0x39,0x61,0x62,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,
0x29,0x42,0x2e,0x73,0x39,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x38,0x39,0x61,0x62,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x61,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x38,0x39,0x61,0x62,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x62,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x63,0x64,0x65,0x66,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x63,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x63,0x64,0x65,0x66,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x64,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x63,0x64,0x65,0x66,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x65,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x63,0x64,0x65,0x66,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x66,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x31,0x36,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x42,0x29,0x2c,0x20,0x69,0x2c,0x20,0x50,0x61,0x73,0x74,0x6b,0x65,0x79,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x2a,0x3d,0x20,0x56,0x73,0x63,0x61,0x6c,0x65,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x2a,0x3d,0x20,0x56,0x73,0x63,0x61,0x6c,0x65,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x2a,0x3d,0x20,0x56,0x73,0x63,0x61,0x6c,0x65,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x2a,0x3d,0x20,0x56,0x73,0x63,0x61,0x6c,0x65,0x3b,0xa,0xa,0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x41,0x44,0x44,0x5f,0x4d,0x41,0x53,0x4b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x2e,0x73,0x30,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x30,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x30,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x2e,0x73,0x30,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x30,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x2e,0x73,0x30,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x30,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0
x20,0x2b,0x20,0x32,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x2e,0x73,0x30,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x30,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x2e,0x73,0x31,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x30,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x2e,0x73,0x31,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x2e,0x73,0x31,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x32,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x2e,0x73,0x31,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x2e,0x73,0x32,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x30,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x2e,0x73,0x32,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x2e,0x73,0x32,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x32,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x2e,0x73,0x32,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x2e,0x73,0x33,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x30,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x2e,0x73,0x33,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0
x6f,0x75,0x74,0x32,0x2e,0x73,0x33,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x32,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x2e,0x73,0x33,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x23,0x65,0x6c,0x73,0x65,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x2e,0x73,0x30,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x30,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x30,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x30,0x2e,0x73,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x2e,0x73,0x30,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x30,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x31,0x2e,0x73,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x2e,0x73,0x30,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x30,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x32,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x32,0x2e,0x73,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x2e,0x73,0x30,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x30,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x33,0x2e,0x73,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x2e,0x73,0x31,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x30,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x30,0x2e,0x73,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x2e,0x73,0x31,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x31,0x2e,0x73,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x2e,0x73,0x31,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x32,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0
x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x32,0x2e,0x73,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x2e,0x73,0x31,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x33,0x2e,0x73,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x2e,0x73,0x32,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x30,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x30,0x2e,0x73,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x2e,0x73,0x32,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x31,0x2e,0x73,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x2e,0x73,0x32,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x32,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x32,0x2e,0x73,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x2e,0x73,0x32,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x33,0x2e,0x73,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x2e,0x73,0x33,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x30,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x30,0x2e,0x73,0x33,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x2e,0x73,0x33,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x31,0x2e,0x73,0x33,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x2e,0x73,0x33,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x32,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0
x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x32,0x2e,0x73,0x33,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x2e,0x73,0x33,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x33,0x2e,0x73,0x33,0x3b,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x6f,0x75,0x74,0x30,0x29,0x2c,0x20,0x30,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2a,0x20,0x34,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2a,0x20,0x34,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x31,0x20,0x3e,0x3d,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x6f,0x75,0x74,0x31,0x29,0x2c,0x20,0x30,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2a,0x20,0x34,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x34,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x32,0x20,0x3e,0x3d,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x6f,0x75,0x74,0x32,0x29,0x2c,0x20,0x30,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2a,0x20,0x34,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x34,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x33,0x20,0x3e,0x3d,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x6f,0x75,0x74,0x33,0x29,0x2c,0x20,0x30,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2a,0x20,0x34,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x34,0x29,0x3b,0xa,0x23,0x65,0x6c,0x73,0x65,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x20,0x2b,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x3d,0x20,0x28
,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x33,0x29,0x20,0x2f,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x34,0x20,0x3d,0x20,0x28,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x33,0x29,0x20,0x2f,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x48,0x45,0x41,0x44,0x44,0x49,0x4d,0x5f,0x4c,0x45,0x41,0x56,0x45,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x30,0x3b,0x20,0x69,0x20,0x3c,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x34,0x20,0x2d,0x20,0x31,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x41,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x69,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x42,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x69,0x2c,0x20,0x50,0x61,0x73,0x74,0x6b,0x65,0x79,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x41,0x2e,0x73,0x30,0x2c,0x20,0x42,0x2e,0x73,0x30,0x31,0x32,0x33,0x2c,0x20,0x6f,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x41,0x2e,0x73,0x34,0x2c,0x20,0x42,0x2e,0x73,0x34,0x35,0x36,0x37,0x2c,0x20,0x6f,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x41,0x2e,0x73,0x38,0x2c,0x20,0x42,0x2e,0x73,0x38,0x39,0x61,0x62,0x2c,0x20,0x6f,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x41,0x2e,0x73,0x63,0x2c,0x20,0x42,0x2e,0x73,0x63,0x64,0x65,0x66,0x2c,0x20,0x6f,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x28,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x34,0x3b,0x20,0x69,0x20,0x3c,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x42,0x20,0x3d,0x20,0x43,
0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x2c,0x20,0x50,0x61,0x73,0x74,0x6b,0x65,0x79,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x41,0x2e,0x73,0x30,0x2c,0x20,0x42,0x2c,0x20,0x6f,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x23,0x65,0x6c,0x73,0x65,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x30,0x3b,0x20,0x69,0x20,0x3c,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x34,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x41,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x69,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x42,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x69,0x2c,0x20,0x50,0x61,0x73,0x74,0x6b,0x65,0x79,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x41,0x2e,0x73,0x30,0x2c,0x20,0x42,0x2e,0x73,0x30,0x31,0x32,0x33,0x2c,0x20,0x6f,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x41,0x2e,0x73,0x34,0x2c,0x20,0x42,0x2e,0x73,0x34,0x35,0x36,0x37,0x2c,0x20,0x6f,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x41,0x2e,0x73,0x38,0x2c,0x20,0x42,0x2e,0x73,0x38,0x39,0x61,0x62,0x2c,0x20,0x6f,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x41,0x2e,0x73,0x63,0x2c,0x20,0x42,0x2e,0x73,0x63,0x64,0x65,0x66,0x2c,0x20,0x6f,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x20,0x3d,0x3d,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x20,0x72,0x65,0x6d,0x61,0x69,0x6e,0x20,0x3d,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2d,0x20,0x7a,0x20,0x2a,0x20,0x34,0x20,0x2d,0x20,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x6b,0x65,0x79,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x3d,0x20,0x72,0x65,0x6d,0x61,0x69,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x74,0x6d,0x70,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d
,0x20,0x30,0x3b,0x20,0x69,0x20,0x3c,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x41,0x20,0x3d,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x2a,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x42,0x20,0x3d,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x2a,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x6b,0x65,0x79,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x42,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x74,0x6d,0x70,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x6f,0x75,0x74,0x5f,0x70,0x74,0x72,0x20,0x3d,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x29,0x26,0x6f,0x75,0x74,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x5f,0x70,0x74,0x72,0x5b,0x72,0x65,0x6d,0x61,0x69,0x6e,0x5d,0x20,0x3d,0x20,0x74,0x6d,0x70,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x20,0x2a,0x3d,0x20,0x56,0x73,0x63,0x61,0x6c,0x65,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x6f,0x75,0x74,0x29,0x2c,0x20,0x30,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x79,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2a,0x20,0x34,0x20,0x2b,0x20,0x7a,0x34,0x29,0x3b,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0x7d,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x6d,0x61,0x74,0x6d,0x75,0x6c,0x5f,0x71,0x6b,0x76,0x28,0x47,0x4c,0x4f,0x42,0x41,0x4c,0x5f,0x53,0x49,0x5a,0x45,0x5f,0x33,0x5f,0x44,0x49,0x4d,0x53,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x2f,0x2f,0x20,0x71,0x6b,0x20,0x70,0x72,0x65,0x66,0x69,0x6c,0x6c,0x20,0x5b,0x31,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x20,0x71,0x6b,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x2f,0x34,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x34,0x5d,0x20,0x20,0x20,0x64,0x65,0x63,0x6f,0x64,0x65,0x5b,0x31,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x2f,0x34,0x20,0x34,0x5d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x2f,0x2f,0x20,0x5b,0x31,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x2f,0x34,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x34,0x5d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0
x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x20,0x2f,0x2f,0x20,0x5b,0x31,0x20,0x71,0x6b,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x2a,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x31,0x20,0x34,0x5d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x70,0x61,0x73,0x74,0x5f,0x76,0x61,0x6c,0x75,0x65,0x2c,0x20,0x2f,0x2f,0x20,0x5b,0x31,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x2f,0x34,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x34,0x5d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x71,0x6b,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x7b,0xa,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x78,0x20,0x3d,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x3b,0x20,0x2f,0x2f,0x20,0x70,0x72,0x65,0x66,0x69,0x6c,0x6c,0x20,0x71,0x6b,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2f,0x20,0x34,0x20,0x20,0x20,0x64,0x65,0x63,0x6f,0x64,0x65,0x20,0x31,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x79,0x20,0x3d,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x3b,0x20,0x2f,0x2f,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x7a,0x20,0x3d,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x32,0x29,0x3b,0x20,0x2f,0x2f,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x3c,0x3c,0x20,0x32,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x7a,0x34,0x20,0x3d,0x20,0x7a,0x20,0x3c,0x3c,0x20,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x44,0x45,0x41,0x4c,0x5f,0x4e,0x4f,0x4e,0x5f,0x55,0x4e,0x49,0x46,0x4f,0x52,0x4d,0x5f,0x44,0x49,0x4d,0x33,0x28,0x78,0x2c,0x20,0x79,0x2c,0x20,0x7a,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x4f,0x50,0x45,0x4e,0x43,0x4c,0x5f,0x50,0x52,0x45,0x46,0x49,0x4c,0x4c,0x5f,0x41,0x54,0x54,0x45,0x4e,0x54,0x49,0x4f,0x4e,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x68,
0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x20,0x3d,0x20,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2a,0x20,0x34,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x3d,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x33,0x29,0x20,0x2f,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x71,0x6b,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x3d,0x20,0x28,0x71,0x6b,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x33,0x29,0x20,0x2f,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x20,0x2b,0x20,0x28,0x79,0x20,0x2a,0x20,0x71,0x6b,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2b,0x20,0x78,0x29,0x20,0x2a,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x20,0x2b,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x3b,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x70,0x61,0x73,0x74,0x5f,0x76,0x61,0x6c,0x75,0x65,0x20,0x2b,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x30,0x3b,0x20,0x69,0x20,0x3c,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x20,0x69,0x6e,0x64,0x65,0x78,0x20,0x3d,0x20,0x69,0x20,0x3c,0x3c,0x20,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x30,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x6e,0x64,0x65,0x78,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x31,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,
0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x31,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x32,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x32,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x33,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x33,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x42,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x30,0x2c,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x69,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x30,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x31,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x31,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x32,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x32,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x33,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x34,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x31,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x35,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x32,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x36,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x7
3,0x37,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x38,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x31,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x39,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x32,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x61,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x62,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x63,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x31,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x64,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x32,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x65,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x66,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x31,0x36,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x42,0x29,0x2c,0x20,0x30,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x69,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0xa,0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x48,0x45,0x41,0x44,0x44,0x49,0x4d,0x5f,0x4c,0x45,0x41,0x56,0x45,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x42,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x30,0x2c,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x42,0x5f,0x70,0x74,0x72,0x20,0x3d,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x29,0x26,0x42,0x3b,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x28,0x76,0x61,0
x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x34,0x2c,0x20,0x6a,0x20,0x3d,0x20,0x30,0x3b,0x20,0x69,0x20,0x3c,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x3b,0x20,0x2b,0x2b,0x69,0x2c,0x20,0x2b,0x2b,0x6a,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x30,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x20,0x2b,0x20,0x34,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x20,0x2b,0x20,0x38,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x20,0x2b,0x20,0x31,0x32,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x6f,0x75,0x74,0x30,0x29,0x2c,0x20,0x30,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x29,0x20,0x2a,0x20,0x34,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x42,0x2e,0x73,0x30,0x31,0x32,0x33,0x29,0x2c,0x20,0x30,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x31,0x20,0x3e,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x6f,0x75,0x74,0x31,0x29,0x2c,0x20,0x30,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x34,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x42,0x2e,0x73,0x34,0x35,0x36,0x37,0x29,0x2c,0x20,0x
31,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x32,0x20,0x3e,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x6f,0x75,0x74,0x32,0x29,0x2c,0x20,0x30,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x34,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x42,0x2e,0x73,0x38,0x39,0x61,0x62,0x29,0x2c,0x20,0x32,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x33,0x20,0x3e,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x6f,0x75,0x74,0x33,0x29,0x2c,0x20,0x30,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x34,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x42,0x2e,0x73,0x63,0x64,0x65,0x66,0x29,0x2c,0x20,0x33,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x23,0x65,0x6c,0x73,0x65,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x42,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x30,0x2c,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x31,0x36,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x42,0x29,0x2c,0x20,0x30,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x42,0x5f,0x70,0x74,0x72,0x20,0x3d,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x
54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x29,0x26,0x42,0x3b,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x34,0x2c,0x20,0x6a,0x20,0x3d,0x20,0x30,0x3b,0x20,0x69,0x20,0x3c,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x3b,0x20,0x2b,0x2b,0x69,0x2c,0x20,0x2b,0x2b,0x6a,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x30,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x20,0x2b,0x20,0x34,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x20,0x2b,0x20,0x38,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x20,0x2b,0x20,0x31,0x32,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x31,0x36,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x29,0x28,0x6f,0x75,0x74,0x30,0x2c,0x20,0x6f,0x75,0x74,0x31,0x2c,0x20,0x6f,0x75,0x74,0x32,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x29,0x2c,0x20,0x30,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x29,0x20,0x2a,0x20,0x34,0x29,0x3b,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0xa,0x23,0x65,0x6c,0x73,0x65,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x3d,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x33,0x29,0x20,0x2f,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x20,0x3d,0x20,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2a,0x20,0x34,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x
74,0x20,0x69,0x6e,0x74,0x20,0x6c,0x6f,0x6f,0x70,0x20,0x3d,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x32,0x29,0x20,0x2f,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x20,0x2b,0x20,0x79,0x20,0x2a,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x20,0x2b,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x3b,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x70,0x61,0x73,0x74,0x5f,0x76,0x61,0x6c,0x75,0x65,0x20,0x2b,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x30,0x3b,0x20,0x69,0x20,0x3c,0x20,0x6c,0x6f,0x6f,0x70,0x20,0x2d,0x20,0x31,0x3b,0x20,0x69,0x2b,0x2b,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x42,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x30,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x69,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x30,0x20,0x2b,0x3d,0x20,0x64,0x6f,0x74,0x28,0x41,0x2c,0x20,0x42,0x2e,0x73,0x30,0x31,0x32,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x31,0x20,0x2b,0x3d,0x20,0x64,0x6f,0x74,0x28,0x41,0x2c,0x20,0x42,0x2e,0x73,0x34,0x35,0x36,0x37,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x32,0x20,0x2b,0x3d,0x20,0x64,0x6f,0x74,0x28,0x41,0x2c,0x20,0x42,0x2e,0x73,0x38,0x39,0x61,0x62,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x33,0x20,0x2b,0x3d,0x20,0x64,0x6f,0x74,0x28,0x41,0x2c,0x20,0x42,0x2e,0x73,0x63,0x64,0x65,0x66,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x20,0x73,0x74,0x61,0x72,0x74,0x20,0x3d,0x20,0x28,0x6c,0x6f,0x6f,0x70,0x20,0x2d,0x20,0x31,0x29,0x20,0x3c,0x20,0x30,0x20,0x3f,0x20,0x30,0x20,0x3a,0x20,0x28,0x6c,0x6f,0x6f,0x70,0x20,0x2d,0x20,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x42,0x5f,0x56,0x65,0x63,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4
d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x30,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x73,0x74,0x61,0x72,0x74,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x42,0x5f,0x70,0x74,0x72,0x20,0x3d,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x29,0x26,0x42,0x5f,0x56,0x65,0x63,0x3b,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x73,0x74,0x61,0x72,0x74,0x20,0x2a,0x20,0x34,0x3b,0x20,0x69,0x20,0x3c,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2d,0x20,0x31,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x41,0x20,0x3d,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x20,0x69,0x6e,0x64,0x65,0x78,0x20,0x3d,0x20,0x69,0x20,0x25,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x30,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x5f,0x70,0x74,0x72,0x5b,0x69,0x6e,0x64,0x65,0x78,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x31,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x5f,0x70,0x74,0x72,0x5b,0x69,0x6e,0x64,0x65,0x78,0x2b,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x32,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x5f,0x70,0x74,0x72,0x5b,0x69,0x6e,0x64,0x65,0x78,0x2b,0x38,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x33,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x5f,0x70,0x74,0x72,0x5b,0x69,0x6e,0x64,0x65,0x78,0x2b,0x31,0x32,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x41,0x20,0x3d,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2d,0x20,0x31,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x42,0x30,0x20,0x3d,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x30,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x42,0x31,0x20,0x3d,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x42,0x32,0x20,0x3d,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x38,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x42,0x33,0x20,0x3d,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x31,0x32,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x30,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x31,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x32,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x33,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x33,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x20,0x69,0x6e,0x64,0x65,0x78,0x20,0x3d,0x20,0x28,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x6
5,0x6e,0x20,0x2d,0x20,0x31,0x29,0x20,0x3e,0x3e,0x20,0x32,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2d,0x20,0x31,0x29,0x20,0x25,0x20,0x34,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x48,0x45,0x41,0x44,0x44,0x49,0x4d,0x5f,0x4c,0x45,0x41,0x56,0x45,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x5d,0x20,0x3d,0x20,0x42,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x31,0x20,0x3e,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x34,0x5d,0x20,0x3d,0x20,0x42,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x32,0x20,0x3e,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x38,0x5d,0x20,0x3d,0x20,0x42,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x33,0x20,0x3e,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x31,0x32,0x5d,0x20,0x3d,0x20,0x42,0x33,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x33,0x3b,0xa,0x23,0x65,0x6c,0x73,0x65,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x5d,0x20,0x3d,0x20,0x42,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x34,0x5d,0x20,0x3d,0x20,0x42,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x38,0x5d,0x20,0x3d,0x20,0x42,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x31,0x32,0x5d,0x20,0x3d,0x20,0x42,0x33,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7
a,0x34,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x33,0x3b,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0x20,0x20,0x20,0x20,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0x7d,0xa,0xa, } }, #endif #ifndef MNN_OPENCL_BUFFER_CLOSED diff --git a/source/core/ConvolutionCommon.cpp b/source/core/ConvolutionCommon.cpp index 9070fe9c7..6a333f0fa 100644 --- a/source/core/ConvolutionCommon.cpp +++ b/source/core/ConvolutionCommon.cpp @@ -53,6 +53,7 @@ std::shared_ptr ConvolutionCommon::load(const Con size_t weightLength = 0; int8_t *buffer = nullptr; auto originBuffer = (unsigned char *)buffer_ptr; + if (1 == quan->type()) { buffer = IDSTDecoder::ReadQuanData_c(originBuffer, &weightLength, result.get(), quan->shapeInt32()); } diff --git a/source/core/IDSTEncoder.hpp b/source/core/IDSTEncoder.hpp index 2d24b56b5..4c72f2d29 100644 --- a/source/core/IDSTEncoder.hpp +++ b/source/core/IDSTEncoder.hpp @@ -421,32 +421,44 @@ static bool WriteSparseQuanBlobs(std::ostream &out, const float* weightData, con static std::unique_ptr encode(const float* weight, const std::vector& scale, int kernelSize, int kernelNum, bool asymmetricQuantFlag, const int8_t* quantWeightPtr, const int clampMin, const int bits = 8, bool detectSparse = true) { - std::ostringstream outputStringStreamCQ, outputStringStreamSQ; - bool shapeUseInt32 = false; - WriteCQBlobs(outputStringStreamCQ, weight, scale.data(), kernelSize, kernelNum, asymmetricQuantFlag, shapeUseInt32, bits); - bool sparseValid = false; - if (detectSparse) { - sparseValid = WriteSparseQuanBlobs(outputStringStreamSQ, weight, scale.data(), kernelSize, kernelNum, asymmetricQuantFlag, shapeUseInt32, bits); + // compute block_size + + int alpha_size = scale.size(), block_size = kernelSize, block_num = 1; + if (asymmetricQuantFlag) alpha_size /= 2; + if (alpha_size > kernelNum) { + block_num = alpha_size / kernelNum; + block_size = kernelSize / block_num; } + bool shapeUseInt32 = false; std::unique_ptr idst(new IDSTQuanT); + std::ostringstream outputStringStreamCQ; + WriteCQBlobs(outputStringStreamCQ, weight, scale.data(), kernelSize, kernelNum, asymmetricQuantFlag, shapeUseInt32, bits); auto cqStr = outputStringStreamCQ.str(); - auto sqStr = outputStringStreamSQ.str(); - int int8Size = kernelNum * kernelSize; - idst->shapeInt32 = shapeUseInt32; - if (quantWeightPtr && (int8Size <= cqStr.size() && int8Size <= sqStr.size())) { - idst->type = 4; - idst->aMax = kernelNum; - idst->buffer.resize(int8Size); - ::memcpy(idst->buffer.data(), quantWeightPtr, int8Size); - } else if (cqStr.size() <= sqStr.size() || (!sparseValid)) { + if (detectSparse) { + std::ostringstream outputStringStreamSQ; + bool sparseValid = WriteSparseQuanBlobs(outputStringStreamSQ, weight, scale.data(), kernelSize, kernelNum, asymmetricQuantFlag, shapeUseInt32, 
bits); + auto sqStr = outputStringStreamSQ.str(); + int int8Size = kernelNum * kernelSize; + if (quantWeightPtr && (int8Size <= cqStr.size() && int8Size <= sqStr.size())) { + idst->type = 4; + idst->aMax = kernelNum; + idst->buffer.resize(int8Size); + ::memcpy(idst->buffer.data(), quantWeightPtr, int8Size); + } else if (cqStr.size() <= sqStr.size() || (!sparseValid)) { + idst->type = 1; + idst->buffer.resize(cqStr.size()); + ::memcpy(idst->buffer.data(), cqStr.data(), cqStr.size()); + } else { + idst->type = 2; + idst->buffer.resize(sqStr.size()); + ::memcpy(idst->buffer.data(), sqStr.data(), sqStr.size()); + } + } else { idst->type = 1; idst->buffer.resize(cqStr.size()); ::memcpy(idst->buffer.data(), cqStr.data(), cqStr.size()); - } else { - idst->type = 2; - idst->buffer.resize(sqStr.size()); - ::memcpy(idst->buffer.data(), sqStr.data(), sqStr.size()); } + idst->shapeInt32 = shapeUseInt32; idst->alpha.resize(scale.size()); ::memcpy(idst->alpha.data(), scale.data(), scale.size() * sizeof(float)); idst->quantScale = 1.f; diff --git a/source/core/Interpreter.cpp b/source/core/Interpreter.cpp index e9ec694cb..5078d1493 100644 --- a/source/core/Interpreter.cpp +++ b/source/core/Interpreter.cpp @@ -144,36 +144,11 @@ Interpreter* Interpreter::createFromBufferInternal(Content* net, bool enforceAut } void Interpreter::setSessionHint(HintMode mode, int hint) { - switch (mode) { - case MAX_TUNING_NUMBER: - mNet->modes.maxTuningNumber = hint; - break; - case MEM_ALLOCATOR_TYPE: - mNet->modes.memoryAllocatorType = hint; - break; - case WINOGRAD_MEMORY_LEVEL: - mNet->modes.winogradMemoryUsed = hint; - default: - break; - } + mNet->modes.setHint(mode, hint); } void Interpreter::setSessionMode(SessionMode mode) { - if (mode == Session_Input_Inside || mode == Session_Input_User) { - mNet->modes.inputMode = mode; - } else if (mode == Session_Output_User || mode == Session_Output_Inside) { - mNet->modes.outputMode = mode; - } else if (mode == Session_Backend_Auto || mode == Session_Backend_Fix) { - mNet->modes.backendMode = mode; - } else if (mode == Session_Debug || mode == Session_Release) { - mNet->modes.callBackMode = mode; - } else if (mode == Session_Resize_Direct || mode == Session_Resize_Defer) { - mNet->modes.resizeMode = mode; - } else if(mode == Session_Memory_Collect || mode == Session_Memory_Cache) { - mNet->modes.memoryUsageMode = mode; - } else if(mode == Session_Codegen_Disable || mode == Session_Codegen_Enable) { - mNet->modes.codegenMode = mode; - } else if (mode == Session_Resize_Check) { + if (mode == Session_Resize_Check) { for (auto& iter : mNet->sessions) { iter->openResizeCheck(); } @@ -181,6 +156,8 @@ void Interpreter::setSessionMode(SessionMode mode) { for (auto& iter : mNet->sessions) { iter->fixResizeCache(); } + } else { + mNet->modes.setMode(mode); } } diff --git a/source/core/Pipeline.cpp b/source/core/Pipeline.cpp index f2e030e5a..7033108c3 100644 --- a/source/core/Pipeline.cpp +++ b/source/core/Pipeline.cpp @@ -205,9 +205,9 @@ void Pipeline::UnitInfo::setUp(const Command& command, int index, const Op* orig #endif } -Pipeline::Pipeline(const std::string& externalFile, Schedule::PipelineInfo&& info, bool allocInput, bool outputStatic, const TuningAttr& tune, const Runtime* rt, const Runtime* cpuRt) +Pipeline::Pipeline(const std::string& externalFile, Schedule::PipelineInfo&& info, bool allocInput, bool outputStatic, const TuningAttr& tune, const Runtime* rt, const Runtime* cpuRt, int geometryMask) #ifndef MNN_BUILD_MINI - : mContext(info.first.cache.second, 
info.first.cache.first->type(), info.first.info.user ? info.first.info.user->precision : BackendConfig::Precision_Normal), mUseGeometry(rt->onGetCompilerType()) { + : mContext(geometryMask, info.first.cache.second, info.first.cache.first->type(), info.first.info.user ? info.first.info.user->precision : BackendConfig::Precision_Normal), mUseGeometry(rt->onGetCompilerType()) { #else { #endif diff --git a/source/core/Pipeline.hpp b/source/core/Pipeline.hpp index c32701db5..6fb9543d3 100644 --- a/source/core/Pipeline.hpp +++ b/source/core/Pipeline.hpp @@ -27,7 +27,7 @@ class Pipeline : public NonCopyable { bool autoSetOpType; int maxTuningNumber; }; - Pipeline(const std::string& externalFile, Schedule::PipelineInfo&& info, bool allocInput, bool outputStatic, const TuningAttr& tune, const Runtime* rt, const Runtime* cpuRt); + Pipeline(const std::string& externalFile, Schedule::PipelineInfo&& info, bool allocInput, bool outputStatic, const TuningAttr& tune, const Runtime* rt, const Runtime* cpuRt, int geometryMask); ~Pipeline(); ErrorCode fixResizeCache(); void openResizeCheck(); diff --git a/source/core/Session.cpp b/source/core/Session.cpp index 5998c9253..9ab6b460c 100644 --- a/source/core/Session.cpp +++ b/source/core/Session.cpp @@ -44,6 +44,44 @@ static void _createPipelineBackend(Schedule::PipelineInfo& iter, RuntimeInfo& ru iter.first.cache.second.reset(cpuRuntime->onCreate(&defaultConfig)); } } +void Session::ModeGroup::setMode(Interpreter::SessionMode mode) { + if (mode == Interpreter::Session_Input_Inside || mode == Interpreter::Session_Input_User) { + inputMode = mode; + } else if (mode == Interpreter::Session_Output_User || mode == Interpreter::Session_Output_Inside) { + outputMode = mode; + } else if (mode == Interpreter::Session_Backend_Auto || mode == Interpreter::Session_Backend_Fix) { + backendMode = mode; + } else if (mode == Interpreter::Session_Debug || mode == Interpreter::Session_Release) { + callBackMode = mode; + } else if (mode == Interpreter::Session_Resize_Direct || mode == Interpreter::Session_Resize_Defer) { + resizeMode = mode; + } else if(mode == Interpreter::Session_Memory_Collect || mode == Interpreter::Session_Memory_Cache) { + memoryUsageMode = mode; + } else if(mode == Interpreter::Session_Codegen_Disable || mode == Interpreter::Session_Codegen_Enable) { + codegenMode = mode; + } +} +void Session::ModeGroup::setHint(Interpreter::HintMode mode, int hint) { + switch (mode) { + case Interpreter::MAX_TUNING_NUMBER: + maxTuningNumber = hint; + break; + case Interpreter::MEM_ALLOCATOR_TYPE: + memoryAllocatorType = hint; + break; + case Interpreter::WINOGRAD_MEMORY_LEVEL: + winogradMemoryUsed = hint; + break; + case Interpreter::GEOMETRY_COMPUTE_MASK: + geometryMask = hint; + break; + case Interpreter::STRICT_CHECK_MODEL: + checkNetBuffer = hint > 0; + break; + default: + break; + } +} Session::Session(Schedule::ScheduleInfo&& info, const ModeGroup& mode, RuntimeInfo&& runtime) { mMode = mode; mRuntime = std::move(runtime); @@ -59,7 +97,7 @@ Session::Session(Schedule::ScheduleInfo&& info, const ModeGroup& mode, RuntimeIn attr.autoSetOpType = mode.backendMode == Interpreter::Session_Backend_Auto; auto rt = mRuntime.first.find(iter.first.info.type)->second.get(); auto cpuRuntime = mRuntime.second; - std::shared_ptr newPipeline(new Pipeline(mInfo.externalWeightPath, std::move(iter), mode.inputMode == Interpreter::Session_Input_Inside, mode.outputMode == Interpreter::Session_Output_User, attr, rt, cpuRuntime.get())); + std::shared_ptr newPipeline(new Pipeline( 
mInfo.externalWeightPath, std::move(iter), mode.inputMode == Interpreter::Session_Input_Inside, mode.outputMode == Interpreter::Session_Output_User, attr, rt, cpuRuntime.get(), mMode.geometryMask)); mPipelines.emplace_back(std::move(newPipeline)); } mCallBackMode = mode.callBackMode; diff --git a/source/core/Session.hpp b/source/core/Session.hpp index 7a1ba8963..7b3ac7caf 100644 --- a/source/core/Session.hpp +++ b/source/core/Session.hpp @@ -36,6 +36,10 @@ class MNN_PUBLIC Session { int memoryAllocatorType = 0; int maxTuningNumber = MNN_DEFAULT_TUNING_NUMBER; int winogradMemoryUsed = 3; + int geometryMask = 0xFFFF; + bool checkNetBuffer = true; + void setHint(Interpreter::HintMode hint, int magic); + void setMode(Interpreter::SessionMode mode); }; Session(Schedule::ScheduleInfo&& info, const ModeGroup& mode, RuntimeInfo&& runtime); diff --git a/source/core/TensorUtils.cpp b/source/core/TensorUtils.cpp index 3a0823a4a..587d0c170 100644 --- a/source/core/TensorUtils.cpp +++ b/source/core/TensorUtils.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include "core/Backend.hpp" #include "core/Macro.h" namespace MNN { @@ -403,21 +404,22 @@ bool TensorUtils::isDepthToSpaceRegions(const Tensor* output) { } // compute offset through region -static inline int offsetCompute(const Tensor::InsideDescribe::Region& reg, int offset, bool backward) { - Tensor::InsideDescribe::View src; - Tensor::InsideDescribe::View dst; +static inline int offsetCompute(const Tensor::InsideDescribe::Region& reg, int srcOffset, int dstOffset, bool backward) { + const Tensor::InsideDescribe::View* src; + const Tensor::InsideDescribe::View* dst; if (backward) { - src = reg.dst; - dst = reg.src; + src = ®.dst; + dst = ®.src; } else { - src = reg.src; - dst = reg.dst; + src = ®.src; + dst = ®.dst; } int res = 0; for (int i = 0; i < 3; i++) { if (reg.size[i] > 1) { - res += offset / src.stride[i] * dst.stride[i]; - offset %= src.stride[i]; + res += (srcOffset / src->stride[i] - dstOffset / src->stride[i]) * dst->stride[i]; + srcOffset %= src->stride[i]; + dstOffset %= src->stride[i]; } } return res; @@ -473,6 +475,75 @@ bool TensorUtils::refTensorContent(Tensor* dst, const Tensor* src) { return needMalloc; } +static bool _ClipDst(int* stride, int srcOffset, int dstOffset, const int* srcSize, const int* dstSize, const int sizeNum, int* dstMax, int* dstMin) { + /* Compute The range of dx, dy, dz: + s0 * (dx-sx) + s1 * (dy-sy) + s2 * (dz-sz) + (doff-soff) = 0 + Assume the region won't be overlapped, then extract doff -> s0*xd+ s1*yd+s2*zd, soff -> s0*xs+s1*ys+s2*zs + xd-xs=xo, yd-ys=yo, zd-zs=zo + then: + dx-sx+xo = 0 + dy-sy+yo = 0 + dz-sz+zo = 0 + dx=sx-xo -> [max(0, -xo), max(0, min(sxr-xo, dxr))] + dy,dz compute the same + **/ + + int offsetBias = dstOffset - srcOffset; + if (sizeNum == 0) { + // All stride is zero, then size will be all one + return offsetBias == 0; + } + int o[3] = {0, 0, 0}; + int validIndex[3] = {0, 1, 2}; + if (sizeNum == 2) { + if (stride[0] < stride[1]) { + validIndex[0] = 1; + validIndex[1] = 0; + } + } else if (sizeNum > 2) { + int maxs = stride[0]; + int mins = stride[0]; + int maxi = 0; + int mini = 0; + // Sort index by stride + for (int i=1; i maxs) { + maxs = s; + maxi = i; + } + if (s < mins) { + mins = s; + mini = i; + } + } + for (int i=0; i dstReg.src.offset || - srcReg.dst.stride[1] > srcReg.size[2] || - srcReg.dst.stride[2] > srcReg.size[1] * srcReg.size[2]) { - return false; +class TensorUtils::FuseRegionStatus { +public: + enum Status { + FUSE_SRC_COPY, + FUSE_DST_COPY, + 
FUSE_REGION_COMPUTE + }; + void apply(const Tensor::InsideDescribe::Region& srcReg, Tensor::InsideDescribe::Region& dstReg) { + switch (mStatus) { + case FUSE_SRC_COPY: + dstReg.origin = srcReg.origin; + dstReg.src.offset += srcReg.src.offset - srcReg.dst.offset; + break; + case FUSE_DST_COPY: + dstReg.origin = srcReg.origin; + dstReg.dst = srcReg.dst; + dstReg.src = srcReg.src; + dstReg.src.offset = mSrcOff; + dstReg.dst.offset = mDstOff; + dstReg.size[0] = srcReg.size[0]; + dstReg.size[1] = srcReg.size[1]; + dstReg.size[2] = srcReg.size[2]; + break; + case FUSE_REGION_COMPUTE: + { + if (dstSize[0] == 0) { + dstReg.size[0] = 0; + dstReg.origin = nullptr; + break; + } + for (int i=0; i<3; ++i) { + dstReg.size[i] = 1; + dstReg.src.stride[i] = 0; + dstReg.dst.stride[i] = 0; + } + int valid[3] = {0, 0, 0}; + int offset = 3 - dstNum; + if (dstNum > sizeNum) { + for (int i = 2; i >= 0; i--) { + if (i < dstNum) { + if (dstSize[i] == 1) { + expandIdx = i; + } + dstReg.size[i+offset] = dstMax[i] - dstMin[i]; + valid[i] = dstSize[i] > 1; + } else { + dstReg.size[i+offset] = 1; + valid[i] = 0; + } + } + } else { + for (int i=0; i 1 ? 1 : 0; + } + } + int idx = 0; + for (int i = 0; i < 3; i++) { + if (valid[i] > 0 || i == expandIdx) { + dstReg.src.stride[i+offset] = newSrc[idx]; + dstReg.dst.stride[i+offset] = dstDst[idx++]; + } + } + dstReg.origin = srcReg.origin; + dstReg.src.offset = newSrcOffset; + dstReg.dst.offset = newDstOffset; + } + break; + default: + break; + } } - int dstTotalSize = 1, srcTotalSize = 1; - for (int i = 0; i < 3; i++) { - if (dstReg.size[i] > 1) { - dstTotalSize *= dstReg.size[i]; + bool match(const Tensor::InsideDescribe::Region& srcReg, const Tensor::InsideDescribe::Region& dstReg) { + // dont deal size > 1 && stride <= 0 + for (int i = 0; i < 3; i++) { + if (srcReg.size[i] > 1 && (srcReg.src.stride[i] <= 0 || srcReg.dst.stride[i] <= 0)) { + return false; + } + if (dstReg.size[i] > 1 && (dstReg.src.stride[i] <= 0 || dstReg.dst.stride[i] <= 0)) { + return false; + } } - if (srcReg.size[i] > 1) { - srcTotalSize *= srcReg.size[i]; + bool copyValid = true; + // src data isnot full data of dst + if (srcReg.dst.offset > dstReg.src.offset || + srcReg.dst.stride[1] > srcReg.size[2] || + srcReg.dst.stride[2] > srcReg.size[1] * srcReg.size[2]) { + copyValid = false; + } + int dstTotalSize = 1, srcTotalSize = 1; + int dstSrcMin = dstReg.src.offset; + int dstSrcMax = dstSrcMin; + int srcDstMin = srcReg.dst.offset; + int srcDstMax = srcDstMin; + for (int i = 0; i < 3; i++) { + srcDstMax += srcReg.dst.stride[i] * (srcReg.size[i] - 1); + dstSrcMax += dstReg.src.stride[i] * (dstReg.size[i] - 1); + if (dstReg.size[i] > 1) { + dstTotalSize *= dstReg.size[i]; + } + if (srcReg.size[i] > 1) { + srcTotalSize *= srcReg.size[i]; + } } - } - // src data is not full data of dst - if (dstTotalSize > srcTotalSize) { - return false; - } - // dont deal size > 1 && stride <= 0 - for (int i = 0; i < 3; i++) { - if (srcReg.size[i] > 1 && (srcReg.src.stride[i] <= 0 || srcReg.dst.stride[i] <= 0)) { - return false; + // src data is not full data of dst + if (dstTotalSize > srcTotalSize) { + copyValid = false; } - if (dstReg.size[i] > 1 && (dstReg.src.stride[i] <= 0 || dstReg.dst.stride[i] <= 0)) { - return false; + // Valid range is from srcReg: srcDstMin - srcDstMax, if dst's srcReg exceed, not valid for copy + if (srcDstMin > dstSrcMin || srcDstMax < dstSrcMax) { + copyValid = false; } - } - // src copy fuse - if (isCopyRegion(srcReg)) { - dstReg.origin = srcReg.origin; - dstReg.src.offset += 
srcReg.src.offset - srcReg.dst.offset; - return true; - } - // dst copy fuse - if (isCopyRegion(dstReg) && dstTotalSize == srcTotalSize) { - int srcOff = dstReg.src.offset - srcReg.dst.offset; - int dstOff = dstReg.dst.offset; - srcOff = offsetCompute(srcReg, srcOff, true) + srcReg.src.offset; - if (srcReg.src.stride[2] > 0 && srcOff % srcReg.src.stride[2] != 0) { - // when transpose + slice, offset is not align can't fuse - return false; + // src copy fuse + if (isCopyRegion(srcReg) && copyValid) { + mStatus = FUSE_SRC_COPY; + return true; } - dstReg.origin = srcReg.origin; - dstReg.dst = srcReg.dst; - dstReg.src = srcReg.src; - dstReg.src.offset = srcOff; - dstReg.dst.offset = dstOff; - dstReg.size[0] = srcReg.size[0]; - dstReg.size[1] = srcReg.size[1]; - dstReg.size[2] = srcReg.size[2]; - return true; - } -#define MNN_FAST_FUSE_WITHOUT_STL -#ifdef MNN_FAST_FUSE_WITHOUT_STL - // general fuse - int srcDst[3], srcSrc[3], dstSrc[3], dstDst[3], srcSize[3], dstSize[3], newSrc[3], dstStride[3], srcStride[3]; -#define MNN_3_INT_INIT(x, y) { x[0] = y; x[1] = y; x[2] = y; } - MNN_3_INT_INIT(dstStride, -1) - MNN_3_INT_INIT(srcStride, -1) -#undef MNN_3_INT_INIT - int srcNum = 0, dstNum = 0, sizeNum = 0; - for (int i = 0; i < 3; i++) { - if (srcReg.size[i] > 1) { - srcStride[srcNum] = srcReg.dst.stride[i]; - srcDst[srcNum] = srcReg.dst.stride[i]; - srcSrc[srcNum] = srcReg.src.stride[i]; - srcSize[srcNum] = srcReg.size[i]; - srcNum++; - } - if (dstReg.size[i] > 1) { - dstStride[dstNum] = dstReg.src.stride[i]; - dstDst[dstNum] = dstReg.dst.stride[i]; - dstSrc[dstNum] = dstReg.src.stride[i]; - dstSize[dstNum] = dstReg.size[i]; - dstNum++; - } - } - sizeNum = dstNum; -#define MNN_3_INT_DIFF(r, x, y, i) if ((x[i] != y[0]) && (x[i] != y[1]) && (x[i] != y[2])) { if (r > 0) { return false; } else { r = x[i]; } } - int srcExtra = -1, dstExtra = -1; - MNN_3_INT_DIFF(srcExtra, srcStride, dstStride, 0) - MNN_3_INT_DIFF(srcExtra, srcStride, dstStride, 1) - MNN_3_INT_DIFF(srcExtra, srcStride, dstStride, 2) - MNN_3_INT_DIFF(dstExtra, dstStride, srcStride, 0) - MNN_3_INT_DIFF(dstExtra, dstStride, srcStride, 1) - MNN_3_INT_DIFF(dstExtra, dstStride, srcStride, 2) -#undef MNN_3_INT_DIFF - if (dstExtra > 0) { - if (!expandStrideSize(srcDst, srcSrc, srcSize, srcNum, dstExtra)) { - return false; + // dst copy fuse + if (isCopyRegion(dstReg) && dstTotalSize == srcTotalSize && copyValid) { + mSrcOff = dstReg.src.offset - srcReg.dst.offset; + mDstOff = dstReg.dst.offset; + mSrcOff = offsetCompute(srcReg, dstReg.src.offset, srcReg.dst.offset, true) + srcReg.src.offset; + if (!(srcReg.src.stride[2] > 0 && mSrcOff % srcReg.src.stride[2] != 0)) { + // when transpose + slice, offset is not align can't fuse + mStatus = FUSE_DST_COPY; + return true; + } } - } - if (srcExtra > 0) { - if (!expandStrideSize(dstSrc, dstDst, dstSize, dstNum, srcExtra)) { - return false; + #define MNN_3_INT_INIT(x, y) { x[0] = y; x[1] = y; x[2] = y; } + MNN_3_INT_INIT(dstStride, -1) + MNN_3_INT_INIT(srcStride, -1) + expandIdx = -1; + #undef MNN_3_INT_INIT + srcNum = 0, dstNum = 0, sizeNum = 0; + for (int i = 0; i < 3; i++) { + if (srcReg.size[i] > 1) { + srcStride[srcNum] = srcReg.dst.stride[i]; + srcDst[srcNum] = srcReg.dst.stride[i]; + srcSrc[srcNum] = srcReg.src.stride[i]; + srcSize[srcNum] = srcReg.size[i]; + srcNum++; + } + if (dstReg.size[i] > 1) { + dstStride[dstNum] = dstReg.src.stride[i]; + dstDst[dstNum] = dstReg.dst.stride[i]; + dstSrc[dstNum] = dstReg.src.stride[i]; + dstSize[dstNum] = dstReg.size[i]; + dstNum++; + } } - } - // reorder 
srcSrc to newSrc by align srcDst and dstSrc - for (int i = 0; i < dstNum; i++) { - int index = 0; - for (int j = 0; j < srcNum; j++) { - if (dstSrc[j] == srcDst[i]) { - index = j; + sizeNum = dstNum; + #define MNN_3_INT_DIFF(r, x, y, i) if ((x[i] != y[0]) && (x[i] != y[1]) && (x[i] != y[2])) { if (r > 0) { return false; } else { r = x[i]; } } + int srcExtra = -1, dstExtra = -1; + MNN_3_INT_DIFF(srcExtra, srcStride, dstStride, 0) + MNN_3_INT_DIFF(srcExtra, srcStride, dstStride, 1) + MNN_3_INT_DIFF(srcExtra, srcStride, dstStride, 2) + MNN_3_INT_DIFF(dstExtra, dstStride, srcStride, 0) + MNN_3_INT_DIFF(dstExtra, dstStride, srcStride, 1) + MNN_3_INT_DIFF(dstExtra, dstStride, srcStride, 2) + #undef MNN_3_INT_DIFF + if (dstExtra > 0) { + if (!expandStrideSize(srcDst, srcSrc, srcSize, srcNum, dstExtra)) { + return false; } } - newSrc[index] = srcSrc[i]; - } - // set final size and set expandIdx if expand val is 1 - int expandIdx = -1; - int newSrcOffset = offsetCompute(srcReg, dstReg.src.offset - srcReg.dst.offset, true) + srcReg.src.offset; - if (nullptr != srcReg.origin) { - bool valid = _RegionValid(newSrc, newSrcOffset, dstSize, dstNum, TensorUtils::getRawSize(srcReg.origin)); - if (!valid) { - // Exceed src range - return false; + if (srcExtra > 0) { + if (!expandStrideSize(dstSrc, dstDst, dstSize, dstNum, srcExtra)) { + return false; + } } - } - if (dstNum > sizeNum) { - for (int i = 2; i >= 0; i--) { - if (i < dstNum) { - if (dstSize[i] == 1) { - expandIdx = i; + // reorder srcSrc to newSrc by align srcDst and dstSrc + for (int i = 0; i < srcNum; i++) { + int index = -1; + for (int j = 0; j < dstNum; j++) { + if (dstSrc[j] == srcDst[i]) { + index = j; + break; } - dstReg.size[i] = dstSize[i]; - } else { - dstReg.size[i] = 1; } + if (-1 == index) { + return false; + } + newSrc[index] = srcSrc[i]; + newSrcSize[index] = srcSize[i]; } - } -#else - // general fuse - std::set dstStride, srcStride, dstDiff, srcDiff; - std::vector dstDst, dstSrc, srcDst, srcSrc, newSrc, dstSize, srcSize; - for (int i = 0; i < 3; i++) { - if (srcReg.size[i] > 1) { - srcStride.insert(srcReg.dst.stride[i]); - srcDst.push_back(srcReg.dst.stride[i]); - srcSrc.push_back(srcReg.src.stride[i]); - srcSize.push_back(srcReg.size[i]); - } - if (dstReg.size[i] > 1) { - dstStride.insert(dstReg.src.stride[i]); - dstDst.push_back(dstReg.dst.stride[i]); - dstSrc.push_back(dstReg.src.stride[i]); - dstSize.push_back(dstReg.size[i]); - } - } - int sizeNum = dstSize.size(); - std::set_difference(dstStride.begin(), dstStride.end(), srcStride.begin(), srcStride.end(), std::inserter(dstDiff, dstDiff.begin())); - std::set_difference(srcStride.begin(), srcStride.end(), dstStride.begin(), dstStride.end(), std::inserter(srcDiff, srcDiff.begin())); - if (dstDiff.size() > 1 || srcDiff.size() > 1) { - // many diff stride, now dont deal - return false; - } - // expand stride when middle tensor's stride diff - if (!dstDiff.empty()) { - if (!expandSrc(srcDst, srcSrc, srcSize, *dstDiff.begin())) { - return false; - } - } - if (!srcDiff.empty()) { - if (!expandSrc(dstSrc, dstDst, dstSize, *srcDiff.begin())) { + // set final size and set expandIdx if expand val is 1 + newSrcOffset = offsetCompute(srcReg, dstReg.src.offset, srcReg.dst.offset, true) + srcReg.src.offset; + bool valid = _ClipDst(dstSrc, srcReg.dst.offset, dstReg.src.offset, newSrcSize, dstSize, dstNum, dstMax, dstMin); + if (!valid) { return false; } - } - if (dstSize.size() > 3) { - // need splite region, dont deal - return false; - } - // reorder srcSrc to newSrc by align srcDst and 
dstSrc - newSrc.resize(srcSrc.size()); - for (int i = 0; i < dstSrc.size(); i++) { - int index = std::distance(dstSrc.begin(), std::find(dstSrc.begin(), dstSrc.end(), srcDst[i])); - newSrc[index] = srcSrc[i]; - } - // set final size and set expandIdx if expand val is 1 - int expandIdx = -1; - if (dstSize.size() > sizeNum) { - for (int i = 2; i >= 0; i--) { - if (i < dstSize.size()) { - if (dstSize[i] == 1) { - expandIdx = i; - } - dstReg.size[i] = dstSize[i]; - } else { - dstReg.size[i] = 1; + newDstOffset = dstReg.dst.offset; + for (int i=0; i 0) { + newDstOffset += dstMin[i] * dstDst[i]; + newSrcOffset += dstMin[i] * newSrc[i]; } } + mStatus = FUSE_REGION_COMPUTE; + return true; } -#endif - int idx = 0; - for (int i = 0; i < 3; i++) { - if (dstReg.size[i] > 1 || i == expandIdx) { - dstReg.src.stride[i] = newSrc[idx]; - dstReg.dst.stride[i] = dstDst[idx++]; - } - } - dstReg.origin = srcReg.origin; - dstReg.src.offset = newSrcOffset; - return true; +private: + int mStatus; + int mSrcOff; + int mDstOff; + // general fuse + int srcDst[3], srcSrc[3], dstSrc[3], dstDst[3], srcSize[3], dstSize[3], newSrc[3], dstStride[3], srcStride[3]; + int dstMin[3],dstMax[3]; + int newSrcSize[3]; + int srcNum, dstNum, sizeNum; + int newSrcOffset; + int newDstOffset; + int expandIdx; +}; + +TensorUtils::FuseWrap::FuseWrap() { + mStatus = new FuseRegionStatus; +} +TensorUtils::FuseWrap::~ FuseWrap() { + delete mStatus; +} +bool TensorUtils::FuseWrap::match(const Tensor::InsideDescribe::Region& srcReg, const Tensor::InsideDescribe::Region& dstReg) { + return mStatus->match(srcReg, dstReg); } +void TensorUtils::FuseWrap::apply(const Tensor::InsideDescribe::Region& srcReg, Tensor::InsideDescribe::Region& dstReg) { + mStatus->apply(srcReg, dstReg); +} + void TensorUtils::adjustTensorForCompability(Tensor* newTensor) { if (newTensor->dimensions() < 4) { for (int n = newTensor->dimensions(); n < 4; ++n) { diff --git a/source/core/TensorUtils.hpp b/source/core/TensorUtils.hpp index d8f8498ec..d98354597 100644 --- a/source/core/TensorUtils.hpp +++ b/source/core/TensorUtils.hpp @@ -187,7 +187,17 @@ class MNN_PUBLIC TensorUtils { static bool isTileRegion(const Tensor::InsideDescribe::Region& region); static bool isDepthToSpaceRegions(const Tensor* output); static bool reshapeSlice(Tensor::InsideDescribe::Region& slice, int outside, int inside, int axis); - static bool fuseRegion(Tensor::InsideDescribe::Region& srcReg, Tensor::InsideDescribe::Region& dstReg); + + class FuseRegionStatus; + class FuseWrap { + public: + FuseWrap(); + ~ FuseWrap(); + bool match(const Tensor::InsideDescribe::Region& srcReg, const Tensor::InsideDescribe::Region& dstReg); + void apply(const Tensor::InsideDescribe::Region& srcReg, Tensor::InsideDescribe::Region& dstReg); + private: + FuseRegionStatus* mStatus; + }; static void adjustTensorForCompability(Tensor* t); static Tensor::DimensionType getDimType(const Tensor* t); static std::vector getQuantInfo(const Tensor* t); diff --git a/source/geometry/GeometryBinary.cpp b/source/geometry/GeometryBinary.cpp index 81d8b0869..1e6610c11 100644 --- a/source/geometry/GeometryBinary.cpp +++ b/source/geometry/GeometryBinary.cpp @@ -122,93 +122,92 @@ class GeometryBinary : public GeometryComputer { input1Broadcast = true; } #ifdef MNN_BINARY_LOOP_OPT - if (input0Broadcast || input1Broadcast) { - if (inp0format == outFormat && inp1format == outFormat && outFormat != MNN_DATA_FORMAT_NC4HW4 && input0->getType().code == halide_type_float && op->main_as_BinaryOp()->activationType() == 0) { - if 
(!(input0Broadcast && input1Broadcast)) { -// if (false) { - // Use Loop instead of broadcast - std::shared_ptr newTensor(new Tensor); - TensorUtils::copyShape(output, newTensor.get(), true); - newTensor->buffer().type = output->buffer().type; - int srcIndex = 1; - int dstIndex = 2; - if (input0Broadcast) { - ConvertUtils::broadcastto(input0, newTensor.get()); - } else { - srcIndex = 2; - dstIndex = 1; - ConvertUtils::broadcastto(input1, newTensor.get()); - } - auto des = TensorUtils::getDescribe(newTensor.get()); - flatbuffers::FlatBufferBuilder builder; - BinaryOpBuilder binaryOpParamBuilder(builder); - binaryOpParamBuilder.add_opType(op->main_as_BinaryOp()->opType()); - auto binaryOpParamOffset = binaryOpParamBuilder.Finish(); - OpBuilder cmdOpBuilder(builder); - cmdOpBuilder.add_type(OpType_BinaryOp); - cmdOpBuilder.add_main(binaryOpParamOffset.Union()); - cmdOpBuilder.add_main_type(OpParameter_BinaryOp); - auto cmdOpOffset = cmdOpBuilder.Finish(); - auto iterIndexesOffset = builder.CreateVector(std::vector{-1, -1, -1}); - auto stepOffset = builder.CreateVector(std::vector{0, 0, 0}); - auto indexesOffset = builder.CreateVector(std::vector{2, 0, 1}); - std::vector> regionCommands; + // One input need broadcast, the other needn't + bool singleBroadCast = (!(input0Broadcast && input1Broadcast)) && (input0Broadcast || input1Broadcast); + bool forwardSupportLoop = inp0format == outFormat && inp1format == outFormat && outFormat != MNN_DATA_FORMAT_NC4HW4 && input0->getType().code == halide_type_float && op->main_as_BinaryOp()->activationType() == 0; + bool openLoop = context.support(Interpreter::GeometryComputeMask::GEOMETRCOMPUTEMASK_USELOOP); + if (singleBroadCast && forwardSupportLoop && openLoop) { + // Use Loop instead of broadcast + std::shared_ptr newTensor(new Tensor); + TensorUtils::copyShape(output, newTensor.get(), true); + newTensor->buffer().type = output->buffer().type; + int srcIndex = 1; + int dstIndex = 2; + if (input0Broadcast) { + ConvertUtils::broadcastto(input0, newTensor.get()); + } else { + srcIndex = 2; + dstIndex = 1; + ConvertUtils::broadcastto(input1, newTensor.get()); + } + auto des = TensorUtils::getDescribe(newTensor.get()); + flatbuffers::FlatBufferBuilder builder; + BinaryOpBuilder binaryOpParamBuilder(builder); + binaryOpParamBuilder.add_opType(op->main_as_BinaryOp()->opType()); + auto binaryOpParamOffset = binaryOpParamBuilder.Finish(); + OpBuilder cmdOpBuilder(builder); + cmdOpBuilder.add_type(OpType_BinaryOp); + cmdOpBuilder.add_main(binaryOpParamOffset.Union()); + cmdOpBuilder.add_main_type(OpParameter_BinaryOp); + auto cmdOpOffset = cmdOpBuilder.Finish(); + auto iterIndexesOffset = builder.CreateVector(std::vector{-1, -1, -1}); + auto stepOffset = builder.CreateVector(std::vector{0, 0, 0}); + auto indexesOffset = builder.CreateVector(std::vector{2, 0, 1}); + std::vector> regionCommands; - for (int i=0; iregions.size(); ++i) { - auto& reg = des->regions[i]; - auto sizeOffset = builder.CreateVector(reg.size, 3); - auto dstStride = builder.CreateVector(reg.dst.stride, 3); - auto srcStride = builder.CreateVector(reg.src.stride, 3); - std::vector> views(3); - { - ViewBuilder dstBuilder(builder); - dstBuilder.add_offset(reg.dst.offset); - dstBuilder.add_stride(dstStride); - views[0] = dstBuilder.Finish(); - views[dstIndex] = views[0]; - ViewBuilder srcBuilder(builder); - srcBuilder.add_offset(reg.src.offset); - srcBuilder.add_stride(srcStride); - views[srcIndex] = srcBuilder.Finish(); - } - auto viewsOffset = builder.CreateVector>(views); - RegionCommandBuilder 
cmdBuilder(builder); - cmdBuilder.add_op(cmdOpOffset); - cmdBuilder.add_view(viewsOffset); - cmdBuilder.add_size(sizeOffset); - cmdBuilder.add_steps(stepOffset); - cmdBuilder.add_iterIndexes(iterIndexesOffset); - cmdBuilder.add_indexes(indexesOffset); - - regionCommands.emplace_back(cmdBuilder.Finish()); - } - auto rcmdAllOffset = builder.CreateVector>(regionCommands); - auto inputIndexesOffset = builder.CreateVector(std::vector{0, 1}); - auto outputIndexesOffset = builder.CreateVector(std::vector{2}); - LoopParamBuilder loopBuilder(builder); - loopBuilder.add_commands(rcmdAllOffset); - loopBuilder.add_loopNumber(1); - loopBuilder.add_tensorNumber(3); - loopBuilder.add_inputIndexes(inputIndexesOffset); - loopBuilder.add_outputIndexes(outputIndexesOffset); - auto loopOffset = loopBuilder.Finish(); - flatbuffers::Offset nameOffset; - if (nullptr != op->name()) { - nameOffset = builder.CreateString(op->name()->c_str()); - } - OpBuilder finishBuilder(builder); - finishBuilder.add_main(loopOffset.Union()); - finishBuilder.add_main_type(OpParameter_LoopParam); - finishBuilder.add_type(OpType_While); - if (nullptr != op->name()) { - finishBuilder.add_name(nameOffset); - } - builder.Finish(finishBuilder.Finish()); - auto cmd = GeometryComputerUtils::makeCommand(builder, {input0, input1}, outputs); - res.command.emplace_back(std::move(cmd)); - return true; + for (int i=0; iregions.size(); ++i) { + auto& reg = des->regions[i]; + auto sizeOffset = builder.CreateVector(reg.size, 3); + auto dstStride = builder.CreateVector(reg.dst.stride, 3); + auto srcStride = builder.CreateVector(reg.src.stride, 3); + std::vector> views(3); + { + ViewBuilder dstBuilder(builder); + dstBuilder.add_offset(reg.dst.offset); + dstBuilder.add_stride(dstStride); + views[0] = dstBuilder.Finish(); + views[dstIndex] = views[0]; + ViewBuilder srcBuilder(builder); + srcBuilder.add_offset(reg.src.offset); + srcBuilder.add_stride(srcStride); + views[srcIndex] = srcBuilder.Finish(); } + auto viewsOffset = builder.CreateVector>(views); + RegionCommandBuilder cmdBuilder(builder); + cmdBuilder.add_op(cmdOpOffset); + cmdBuilder.add_view(viewsOffset); + cmdBuilder.add_size(sizeOffset); + cmdBuilder.add_steps(stepOffset); + cmdBuilder.add_iterIndexes(iterIndexesOffset); + cmdBuilder.add_indexes(indexesOffset); + + regionCommands.emplace_back(cmdBuilder.Finish()); + } + auto rcmdAllOffset = builder.CreateVector>(regionCommands); + auto inputIndexesOffset = builder.CreateVector(std::vector{0, 1}); + auto outputIndexesOffset = builder.CreateVector(std::vector{2}); + LoopParamBuilder loopBuilder(builder); + loopBuilder.add_commands(rcmdAllOffset); + loopBuilder.add_loopNumber(1); + loopBuilder.add_tensorNumber(3); + loopBuilder.add_inputIndexes(inputIndexesOffset); + loopBuilder.add_outputIndexes(outputIndexesOffset); + auto loopOffset = loopBuilder.Finish(); + flatbuffers::Offset nameOffset; + if (nullptr != op->name()) { + nameOffset = builder.CreateString(op->name()->c_str()); } + OpBuilder finishBuilder(builder); + finishBuilder.add_main(loopOffset.Union()); + finishBuilder.add_main_type(OpParameter_LoopParam); + finishBuilder.add_type(OpType_While); + if (nullptr != op->name()) { + finishBuilder.add_name(nameOffset); + } + builder.Finish(finishBuilder.Finish()); + auto cmd = GeometryComputerUtils::makeCommand(builder, {input0, input1}, outputs); + res.command.emplace_back(std::move(cmd)); + return true; } #endif if (input0Broadcast) { diff --git a/source/geometry/GeometryComputer.cpp b/source/geometry/GeometryComputer.cpp index 
29a34cade..7c9887d30 100644 --- a/source/geometry/GeometryComputer.cpp +++ b/source/geometry/GeometryComputer.cpp @@ -7,6 +7,7 @@ // #include +#include #include "geometry/GeometryComputer.hpp" #include "core/Backend.hpp" #include "core/OpCommonUtils.hpp" @@ -18,7 +19,7 @@ namespace MNN { GeometryComputer::Context::~Context() { // Do nothing } -GeometryComputer::Context::Context(std::shared_ptr allocBackend, MNNForwardType type, BackendConfig::PrecisionMode precision) { +GeometryComputer::Context::Context(int mask, std::shared_ptr allocBackend, MNNForwardType type, BackendConfig::PrecisionMode precision) : mMask(mask) { mBackend = allocBackend; flatbuffers::FlatBufferBuilder builder(32); OpBuilder opBuilder(builder); @@ -287,20 +288,83 @@ void GeometryComputer::Context::getRasterCacheCreateRecursive(Tensor* src, Comma if (_hasZeroDim(src)) { return; } - for (auto& input : srcDes->regions) { - MNN_ASSERT(input.origin != src); - auto inputDes = TensorUtils::getDescribe(input.origin); - while (_virtualMemory(inputDes)) { - if (1 != inputDes->regions.size()) { + bool needDelete = false; + bool supportFuse = support(Interpreter::GEOMETRCOMPUTEMASK_FUSEREGION); + bool supportFuseMulti = support(Interpreter::GEOMETRCOMPUTEMASK_FUSEREGION_MULTI); + for (int regIndex = 0; regIndex < srcDes->regions.size();) { + auto input = srcDes->regions.data() + regIndex; + MNN_ASSERT(input->origin != src); + + auto inputDes = TensorUtils::getDescribe(input->origin); + while (_virtualMemory(inputDes) && supportFuse) { + if (0 == inputDes->regions.size()) { + // Empty Input, Remove the region by set size as 0 + input->size[0] = 0; + needDelete = true; break; } - bool merge = TensorUtils::fuseRegion(inputDes->regions[0], input); - if (!merge) { + if (1 < inputDes->regions.size()) { + if (!supportFuseMulti) { + break; + } + bool allCanMerge = true; + for (auto& reg : inputDes->regions) { + allCanMerge = allCanMerge && mFuseUtils.match(reg, *input); + if (!allCanMerge) { + break; + } + } + if (!allCanMerge) { + break; + } + Tensor::InsideDescribe::Region backup = *input; + mFuseUtils.match(inputDes->regions[0], *input); + mFuseUtils.apply(inputDes->regions[0], *input); + for (int i=1; iregions.size(); ++i) { + auto newReg = backup; + mFuseUtils.match(inputDes->regions[i], newReg); + mFuseUtils.apply(inputDes->regions[i], newReg); + if (newReg.size[0] == 0) { + continue; + } + srcDes->regions.emplace_back(newReg); + } + // After emplace_back, the input will change, reref it + input = srcDes->regions.data() + regIndex; + if (input->size[0] == 0) { + needDelete = true; + break; + } + inputDes = TensorUtils::getDescribe(input->origin); + continue; + } + bool merge = mFuseUtils.match(inputDes->regions[0], *input); + if (merge) { + mFuseUtils.apply(inputDes->regions[0], *input); + } else { break; } - inputDes = TensorUtils::getDescribe(input.origin); + if (input->size[0] == 0) { + needDelete = true; + break; + } + inputDes = TensorUtils::getDescribe(input->origin); + } + if (input->size[0] > 0) { + getRasterCacheCreateRecursive(input->origin, cmd); + } + ++regIndex; + } + if (needDelete) { + auto regions = std::move(srcDes->regions); + srcDes->regions.reserve(regions.size()); + for (int regIndex = 0; regIndex < regions.size(); ++regIndex) { + auto input = std::move(regions[regIndex]); + if (input.size[0] == 0 || input.size[1] == 0 || input.size[2] == 0) { + continue; + } + srcDes->regions.emplace_back(std::move(input)); } - getRasterCacheCreateRecursive(input.origin, cmd); } getRasterCacheCreate(src, cmd); } diff --git 
a/source/geometry/GeometryComputer.hpp b/source/geometry/GeometryComputer.hpp index a049be684..f826e4d99 100644 --- a/source/geometry/GeometryComputer.hpp +++ b/source/geometry/GeometryComputer.hpp @@ -23,7 +23,7 @@ class GeometryComputer { } class MNN_PUBLIC Context { public: - Context(std::shared_ptr allocBackend, MNNForwardType type = MNN_FORWARD_CPU, BackendConfig::PrecisionMode precision = BackendConfig::Precision_Normal); + Context(int mask, std::shared_ptr allocBackend, MNNForwardType type = MNN_FORWARD_CPU, BackendConfig::PrecisionMode precision = BackendConfig::Precision_Normal); ~Context(); void clear(); @@ -41,6 +41,9 @@ class GeometryComputer { inline BackendConfig::PrecisionMode precisionType() const { return mPrecision; } + inline bool support(int option) const { + return mMask & option; + } std::shared_ptr mRasterOp; private: void getRasterCacheCreate(Tensor* src, CommandBuffer& cmd); @@ -50,6 +53,8 @@ class GeometryComputer { std::shared_ptr mBackend; MNNForwardType mForwardType; BackendConfig::PrecisionMode mPrecision; + TensorUtils::FuseWrap mFuseUtils; + const int mMask; }; static void init(); MNN_PUBLIC static const GeometryComputer* search(int opType, Runtime::CompilerType compType); diff --git a/source/geometry/GeometryComputerUtils.cpp b/source/geometry/GeometryComputerUtils.cpp index 54db0c1d0..fc76622ab 100644 --- a/source/geometry/GeometryComputerUtils.cpp +++ b/source/geometry/GeometryComputerUtils.cpp @@ -147,8 +147,9 @@ ErrorCode GeometryComputerUtils::shapeComputeAndGeometryTransform( Runtime::CompilerType compileType, bool skipShapeCompute, bool permitCodegen) { + bool openCache = geoContext.support(Interpreter::GeometryComputeMask::GEOMETRCOMPUTEMASK_OPENCACHE); /** Size Compute and compute Const Begin */ - GeometryComputer::Context ctx(backupBackend); + GeometryComputer::Context ctx(Interpreter::GeometryComputeMask::GEOMETRCOMPUTEMASK_ALL, backupBackend); // Size Compute and compute Const for (int i=0; itype(), Runtime::Compiler_Loop); { - auto res = geo->onRecompute(info.op, info.inputs, info.outputs, geoContext, tempBuffer); + bool res = false; + if (openCache) { + res = geo->onRecompute(info.op, info.inputs, info.outputs, geoContext, tempBuffer); + } if (!res) { tempBuffer.command.clear(); tempBuffer.extras.clear(); @@ -350,7 +354,7 @@ ErrorCode GeometryComputerUtils::shapeComputeAndGeometryTransform( auto geo = GeometryComputer::search(info.op->type(), compileType); { bool res = false; - if (!tempBuffer.hasWrap) { + if ((!tempBuffer.hasWrap) && openCache) { res = geo->onRecompute(info.op, info.inputs, info.outputs, geoContext, tempBuffer); } if (!res) { diff --git a/test.sh b/test.sh index 0d0e90e1b..9d7c1b2d7 100755 --- a/test.sh +++ b/test.sh @@ -294,7 +294,7 @@ model_test() { echo '### 静态模型测试失败,测试终止!' failed fi - + if [ "$OPENCL_CHANGE" ]; then ../tools/script/modelTest.py ~/AliNNModel 3 0.002 1 if [ $? -ne 0 ]; then @@ -431,7 +431,7 @@ opencv_test() { llm_test() { # 1. build llm with low memory - cmake -DMNN_OPENCV_TEST=ON -DMNN_BUILD_LLM=ON .. + cmake -DMNN_LOW_MEMORY=ON -DMNN_BUILD_LLM=ON -DMNN_SUPPORT_TRANSFORMER_FUSE=ON .. make -j8 llm_build_wrong=$[$? > 0] printf "TEST_NAME_LLM_BUILD: LLM编译测试\nTEST_CASE_AMOUNT_LLM_BUILD: {\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n" \ @@ -441,7 +441,7 @@ llm_test() { failed fi # 2. run llm model test - ./llm_demo ~/AliNNModel/qwen-1.8b-int4 0 10 ~/AliNNModel/qwen-1.8b-int4/prompt.txt + ./llm_demo ~/AliNNModel/qwen1.5-0.5b-int4/config.json ~/AliNNModel/qwen1.5-0.5b-int4/prompt.txt if [ $? 
-gt 0 ]; then echo '### LLM模型测试失败,测试终止!' failed @@ -543,7 +543,7 @@ android_model_test() { fi fi done - + models=`ls ~/AliNNModel/TestResource/` for model in $models do @@ -562,7 +562,7 @@ android_model_test() { fi fi done - + models=`ls ~/AliNNModel/TestWithDescribe/` for model in $models do diff --git a/test/CommonOpCreator.hpp b/test/CommonOpCreator.hpp index 1097589cd..9efbb51a5 100644 --- a/test/CommonOpCreator.hpp +++ b/test/CommonOpCreator.hpp @@ -24,7 +24,7 @@ static PadMode _convertPadMode(Express::PaddingMode mode) { } return PadMode_CAFFE; } -static Express::VARP _HybridConv(const std::vector& weight, std::vector&& bias, std::vector&& alpha, Express::VARP x, std::vector channel, std::vector kernelSize, +static Express::VARP _HybridConv(const std::vector& weight, const std::vector& bias, const std::vector& alpha, Express::VARP x, std::vector channel, std::vector kernelSize, Express::PaddingMode pad, std::vector stride, std::vector dilate, int group, std::vector pads, bool relu, bool relu6, int nbits, bool async) { std::unique_ptr convOp(new OpT); convOp->type = OpType_Convolution; @@ -56,7 +56,7 @@ static Express::VARP _HybridConv(const std::vector& weight, std::vectorcommon->relu = relu; conv2D->weight.clear(); MNN_ASSERT(bias.size() == channel[1]); - conv2D->bias = std::move(bias); + conv2D->bias = bias; return (Express::Variable::create(Express::Expr::create(convOp.get(), {x}))); } diff --git a/test/MNNTestSuite.cpp b/test/MNNTestSuite.cpp index fa039b526..157a54051 100644 --- a/test/MNNTestSuite.cpp +++ b/test/MNNTestSuite.cpp @@ -6,9 +6,10 @@ // Copyright © 2018, Alibaba Group Holding Limited // -#include "MNNTestSuite.h" #include - +#include +#include +#include "MNNTestSuite.h" MNNTestSuite* MNNTestSuite::gInstance = NULL; MNNTestSuite* MNNTestSuite::get() { @@ -30,14 +31,14 @@ void MNNTestSuite::add(MNNTestCase* test, const char* name) { } static void printTestResult(int wrong, int right, const char* flag) { - printf("TEST_NAME_UNIT%s: 单元测试%s\nTEST_CASE_AMOUNT_UNIT%s: ", flag, flag, flag); - printf("{\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n", wrong, right); + MNN_PRINT("TEST_NAME_UNIT%s: 单元测试%s\nTEST_CASE_AMOUNT_UNIT%s: ", flag, flag, flag); + MNN_PRINT("{\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n", wrong, right); } int MNNTestSuite::run(const char* key, int precision, const char* flag) { if (key == NULL || strlen(key) == 0) return 0; - + std::map runTimes; auto suite = MNNTestSuite::get(); std::string prefix = key; std::vector wrongs; @@ -46,26 +47,32 @@ int MNNTestSuite::run(const char* key, int precision, const char* flag) { MNNTestCase* test = suite->mTests[i]; if (test->name.find(prefix) == 0) { runUnit++; - printf("\trunning %s.\n", test->name.c_str()); + MNN_PRINT("\trunning %s.\n", test->name.c_str()); + MNN::Timer _t; auto res = test->run(precision); + runTimes.insert(std::make_pair(test->name, _t.durationInUs() / 1000.0f)); if (!res) { wrongs.emplace_back(test->name); } } } if (wrongs.empty()) { - printf("√√√ all <%s> tests passed.\n", key); + MNN_PRINT("√√√ all <%s> tests passed.\n", key); } for (auto& wrong : wrongs) { - printf("Error: %s\n", wrong.c_str()); + MNN_PRINT("Error: %s\n", wrong.c_str()); } printTestResult(wrongs.size(), runUnit - wrongs.size(), flag); + for (auto& iter : runTimes) { + MNN_PRINT("%s cost time: %.3f ms\n", iter.first.c_str(), iter.second); + } return wrongs.size(); } int MNNTestSuite::runAll(int precision, const char* flag) { auto suite = MNNTestSuite::get(); std::vector wrongs; + std::map runTimes; 
for (int i = 0; i < suite->mTests.size(); ++i) { MNNTestCase* test = suite->mTests[i]; if (test->name.find("speed") != std::string::npos) { @@ -76,18 +83,23 @@ int MNNTestSuite::runAll(int precision, const char* flag) { // Don't test for model because need resource continue; } - printf("\trunning %s.\n", test->name.c_str()); + MNN_PRINT("\trunning %s.\n", test->name.c_str()); + MNN::Timer _t; auto res = test->run(precision); + runTimes.insert(std::make_pair(test->name, _t.durationInUs() / 1000.0f)); if (!res) { wrongs.emplace_back(test->name); } } if (wrongs.empty()) { - printf("√√√ all tests passed.\n"); + MNN_PRINT("√√√ all tests passed.\n"); } for (auto& wrong : wrongs) { - printf("Error: %s\n", wrong.c_str()); + MNN_PRINT("Error: %s\n", wrong.c_str()); } printTestResult(wrongs.size(), suite->mTests.size() - wrongs.size(), flag); + for (auto& iter : runTimes) { + MNN_PRINT("%s cost time: %.3f ms\n", iter.first.c_str(), iter.second); + } return wrongs.size(); } diff --git a/test/core/RegionFuse.cpp b/test/core/RegionFuse.cpp index 9cabc388f..2820dd09b 100644 --- a/test/core/RegionFuse.cpp +++ b/test/core/RegionFuse.cpp @@ -26,16 +26,16 @@ class RegionFuseTest : public MNNTestCase { {0, 1, 16, 1, 0, 1, 16, 1, 1, 4, 16}, // transpose + memcpy = transpose: [1, 4, 16] => [1, 16, 4] => [16, 1, 4] {0, 1, 1, 16, 0, 1, 4, 1, 1, 16, 4}, - {0, 1, 1, 1, 0, 1, 1, 1, 16, 1, 4}, + {0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 64}, {0, 1, 1, 16, 0, 1, 4, 1, 1, 16, 4}, // transpose + transpose' = transpose'': [3, 4, 5] => [5, 3, 4] => [4, 5, 3] {0, 1, 1, 5, 0, 1, 12, 1, 1, 5, 12}, {0, 1, 1, 4, 0, 1, 15, 1, 1, 4, 15}, {0, 5, 1, 20, 0, 15, 3, 1, 4, 5, 3}, - // memcpy + memcpy' = memcpy'': offset:2 => offset:3 => offser:6+2-3=5 + // memcpy + memcpy' = memcpy'': offset:2 => offset:3 => offser:6+2-3=5, clip: range: 3-19 & 6-22 = 6-19, size=13 {2, 1, 1, 1, 3, 1, 1, 1, 1, 1, 16}, {6, 1, 1, 1, 0, 1, 1, 1, 1, 1, 16}, - {5, 1, 1, 1, 0, 1, 1, 1, 1, 1, 16}, + {5, 1, 1, 1, 0, 1, 1, 1, 1, 1, 13}, // transpose + slice (offset align) => [3, 3, 4] => [3, 4, 3] => [2, 4, 3] {0, 12, 1, 4, 0, 12, 3, 1, 3, 4, 3}, {12, 36, 3, 1, 0, 24, 3, 1, 1, 8, 3}, @@ -44,10 +44,10 @@ class RegionFuseTest : public MNNTestCase { {0, 12, 1, 4, 0, 12, 3, 1, 3, 4, 3}, {18, 36, 3, 1, 0, 18, 3, 1, 1, 6, 3}, {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, - // copy + expand (src < dst) => [34491] => [34645] => [34645, 2] + // copy + expand (src < dst) => [34491] => [34645] => [34645, 2] , clip [34491, 34645] -> [34491, 2] {0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 34491}, {0, 1, 1, 1, 0, 2, 1, 1, 34645, 1, 1}, - {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {0, 1, 1, 1, 0, 1, 1, 2, 1, 1, 34491}, // transpose + slice: [3, 256, 940] => [3, 940, 256] => [1, 256, 940] (expand_val = 1) {0, 240640, 1, 940, 0, 240640, 256, 1, 3, 940, 256}, {0, 1, 256, 1, 0, 1, 768, 1, 1, 940, 256}, @@ -60,26 +60,54 @@ class RegionFuseTest : public MNNTestCase { {0, 1600, 1, 4, 0, 1600, 400, 1, 53, 4, 400}, {0, 400, 20, 1, 0, 400, 20, 1, 190, 20, 20}, {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, - // pad + transpose + slice + transpose (not full copy) + // pad + transpose + slice + transpose (not full copy) {0, 12321, 111, 1, 0, 12544, 112, 1, 32, 111, 111}, {113, 12544, 112, 1, 0, 12321, 111, 1, 32, 111, 111}, - {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1} + {112, 12321, 111, 1, 0, 12321, 111, 1, 32, 110, 110} }; + TensorUtils::FuseWrap fuseUtils; for (int i = 0; i < N; i++) { Region src, dst; src.origin = nullptr; dst.origin = nullptr; ::memcpy(&src, data[3 * i], 44); ::memcpy(&dst, data[3 * i 
+ 1], 44); - bool fused = TensorUtils::fuseRegion(src, dst); + bool fused = fuseUtils.match(src, dst); + Region newDst = dst; + if (fused) { + fuseUtils.apply(src, newDst); + } if (data[3 * i + 2][0] < 0 && !fused) { continue; } - int cmp = ::memcmp(&dst, data[3 * i + 2], 44); - if (!fused || (cmp != 0)) { + if (!fused) { + MNN_ERROR("regionfuse %d test failed for fuse!\n", i); + return false; + } + Region target; + ::memcpy(&target, data[3 * i + 2], 44); + if (target.src.offset != newDst.src.offset || target.dst.offset != newDst.dst.offset) { MNN_ERROR("regionfuse %d test failed!\n", i); return false; } + int cmp = ::memcmp(&newDst.size, target.size, 3 * sizeof(int)); + if (cmp != 0) { + MNN_ERROR("regionfuse %d test size not match\n", i); + return false; + } + for (int u=0; u<3; ++u) { + if (newDst.size[u] == 1) { + continue; + } + if (newDst.src.stride[u] != target.src.stride[u]) { + MNN_ERROR("regionfuse %d test src stride not match\n", i); + return false; + } + if (newDst.dst.stride[u] != target.dst.stride[u]) { + MNN_ERROR("regionfuse %d test dst stride not match\n", i); + return false; + } + } } return true; } diff --git a/test/expr/MatMulTest.cpp b/test/expr/MatMulTest.cpp index e768051ba..5bcc97285 100644 --- a/test/expr/MatMulTest.cpp +++ b/test/expr/MatMulTest.cpp @@ -49,6 +49,20 @@ static bool checkMatMul(const float* C, const float* A, const float* B, int e, i return res; } +static void _originMatMul(float* C, const float* A, const float* B, int e, int l, int h) { + for (int y = 0; y < e; ++y) { + auto AY = A + l * y; + auto CY = C + h * y; + for (int x = 0; x < h; ++x) { + auto BX = B + x; + float expected = 0.0f; + for (int k = 0; k < l; ++k) { + expected += AY[k] * BX[k * h]; + } + CY[x] = expected; + } + } +} class MatMulTest : public MNNTestCase { public: virtual bool run(int precision) { @@ -288,6 +302,45 @@ class MatMulTest : public MNNTestCase { } } } + { + int e = 23; + int l = 33; + int h = 9; + { + // Test MatMul + std::unique_ptr op(new MNN::OpT); + op->type = MNN::OpType_MatMul; + op->main.type = MNN::OpParameter_MatMul; + op->main.value = new MNN::MatMulT; + auto matmulParam = op->main.AsMatMul(); + matmulParam->transposeA = false; + matmulParam->transposeB = false; + + auto x0 = _Input({}, NHWC, halide_type_of()); + auto x1 = _Input({}, NHWC, halide_type_of()); + x0->resize({e, l}); + x1->resize({l, h}); + auto y = Variable::create(Expr::create(op.get(), {x0, x1})); + Variable::prepareCompute({y}); + auto dstY = _Input({e, h}, NHWC, halide_type_of()); + fillFloat(x0->writeMap(), e, l); + fillFloat(x1->writeMap(), l, h); + _originMatMul(dstY->writeMap(), x0->readMap(), x1->readMap(), e, l, h); + + auto absMaxV = _ReduceMax(_Abs(dstY)); + auto diffV = _ReduceMax(_Abs(dstY - y)); + Variable::prepareCompute({absMaxV, diffV}, true); + + auto absMax = absMaxV->readMap()[0]; + MNN_ASSERT(absMax != 0.0f); + auto diff = diffV->readMap()[0]; + + if (diff > 0.01f * absMax) { + MNN_PRINT("%f error larger than %f * 0.001f\n", diff, absMax); + return false; + } + } + } return true; } }; diff --git a/test/op/ConvInt8Test.cpp b/test/op/ConvInt8Test.cpp index 68d25ad65..31f716046 100644 --- a/test/op/ConvInt8Test.cpp +++ b/test/op/ConvInt8Test.cpp @@ -293,7 +293,7 @@ class ConvInt8Im2colGemmTest : public ConvInt8TestCommon { std::vector> kernels = { {4, 2}, {1, 5}, {7, 1} }; - int iw = 34; int ih = 23; + int iw = 24; int ih = 17; std::vector titles = {"4x2", "1x5", "7x1"}; for (int sx=1; sx<2; ++sx) { for (int sy=1; sy<2; ++sy) { diff --git a/test/op/ConvolutionTest.cpp 
b/test/op/ConvolutionTest.cpp index f8245b0fe..836ace993 100644 --- a/test/op/ConvolutionTest.cpp +++ b/test/op/ConvolutionTest.cpp @@ -523,15 +523,10 @@ class ConvolutionInt8CommonTest : public ConvolutionCommonTest { virtual void generateWeight(std::vector& weightData, int ic, int oc, int kh, int kw, int dilation, int group, int sparseBlockOC) { auto numbers = group * (oc / group) * (ic / group) * kw * kh; weightData.resize(numbers); - float rate = 1.0f; - if (numbers > 10000) { - // Avoid exceed fp16 - rate = 0.01f; - } + float rate = 1.0f / numbers; for (int ri = 0; ri < numbers; ri++) { - int i = numbers - ri; - auto data = ((((i / kw)% 1317) * ((i / kh) % 1317)) % 1317 + i / ic + i / oc + (((oc - i) % 1317) * ic) % 1317 + i * ((oc - i) % 1317)) % 1317; - auto floatData = (float)(data % 255) / 255.0f / 1000.0f * rate; + int data = ri - numbers / 2; + auto floatData = (float)(data) * rate; weightData[ri] = data; } } @@ -629,28 +624,6 @@ class ConvolutionInt8CommonTest : public ConvolutionCommonTest { ::memcpy(input->writeMap(), inputData.data(), inputData.size() * sizeof(float)); // Single Conv auto weightLength = weightData.size(); - auto output = _HybridConv(weightData, std::move(biasData), std::move(wScale), input, - {ic, oc}, {kw, kh}, padMap[mode], {stride, stride}, {dilation, dilation}, group, {pad_w, pad_h}, false, false, nbit, async); - - // difference below 0.5% relative error is considered correct. - auto outputPtr = output->readMap(); - - if (debug) { - MNN_PRINT("\ndata NCHW shape:"); - printDims(input->getInfo()->dim); - MNN_PRINT("\nweight OIHW shape:"); - printDims({oc, ic, kh, kw}); - MNN_PRINT("\noutput NCHW shape:"); - printDims(output->getInfo()->dim); - MNN_PRINT("\nexpected output:"); - formatMatrix(outputData.data(), output->getInfo()->dim); - MNN_PRINT("\nexpected output 2:"); - formatMatrix(outputDataSeparateBias.data(), output->getInfo()->dim); - MNN_PRINT("\nreal output:"); - formatMatrix(outputPtr, output->getInfo()->dim); - } - // when using low precision, im2col or strassen convolution error rate to reference value is about 1e-4, winograd has larger error rate. 
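For reference, the convolution tests below compare the backend output against the float reference with a relative tolerance of 0.001 scaled by errorScale, using the checkVectorByRelativeError utility. A minimal sketch of that style of check, assuming a hypothetical standalone helper (not the MNN test utility itself) and normalization by the reference magnitude:

#include <algorithm>
#include <cmath>
#include <cstddef>

// Sketch: relative-error comparison of the kind these convolution tests rely on.
// The tolerance is scaled by the magnitude of the reference values, so a larger
// errorScale (low precision / 4-bit weights) loosens the check accordingly.
static bool checkByRelativeError(const float* real, const float* expect,
                                 size_t n, float tolerance) {
    float maxAbs = 0.0f;
    for (size_t i = 0; i < n; ++i) {
        maxAbs = std::max(maxAbs, std::fabs(expect[i]));
    }
    for (size_t i = 0; i < n; ++i) {
        if (std::fabs(real[i] - expect[i]) > tolerance * maxAbs) {
            return false; // outside the relative threshold
        }
    }
    return true;
}
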
- float errorScale = 1.0f; if (nbit == 4 && weightLength > 10000) { errorScale = 50.0f; @@ -658,29 +631,45 @@ class ConvolutionInt8CommonTest : public ConvolutionCommonTest { if (precision > MNN::BackendConfig::Precision_High) { errorScale = 100.0f; } - if (!checkVectorByRelativeError(outputPtr, outputData.data(), outputDataSeparateBias.data(), outputData.size(), 0.001 * errorScale)) { - MNN_PRINT("precision:%d, expect:\t expect2:\t real:\t\n", precision); - for (int i = 0; i < outputData.size(); ++i) - { - MNN_PRINT("%f\t, %f\t, %f\n", outputData[i],outputDataSeparateBias[i], outputPtr[i]); + std::vector> activations = { + {false, false}, + {true, false}, + {false, true} + }; + for (auto& activation : activations) { + auto output = _HybridConv(weightData, biasData, wScale, input, + {ic, oc}, {kw, kh}, padMap[mode], {stride, stride}, {dilation, dilation}, group, {pad_w, pad_h}, activation.first, activation.second, nbit, async); + auto toutputData = outputData; + float maxV = -10000.0f; + float minV = 10000.0f; + if (activation.first) { + for (auto& t : toutputData) { + maxV = ALIMAX(maxV, t); + minV = ALIMIN(minV, t); + t = ALIMAX(0.0f, t); + } +// MNN_PRINT("Max: %f -> Min:%f\n", maxV, minV); + } + if (activation.second) { + for (auto& t : toutputData) { + t = ALIMAX(0.0f, t); + t = ALIMIN(6.0f, t); + } } - MNN_ERROR("%s(%s) test failed for %d bits, async=%d !\n", test_op_name.c_str(), device_name.c_str(), nbit, async); - return false; - } + // difference below 0.5% relative error is considered correct. + auto outputPtr = output->readMap(); + // when using low precision, im2col or strassen convolution error rate to reference value is about 1e-4, winograd has larger error rate. - if (mBenchSpeed) { - int oh = output->getInfo()->dim[2], ow = output->getInfo()->dim[3]; - input.fix(VARP::INPUT); - MNN::Timer _t; - const int LOOP = 20; - for (int i = 0; i < LOOP; ++i) { - input->writeMap(); - output->readMap(); + if (!checkVectorByRelativeError(outputPtr, toutputData.data(), toutputData.data(), toutputData.size(), 0.001 * errorScale)) { + MNN_PRINT("precision:%d, expect:\t expect2:\t real:\t\n", precision); + for (int i = 0; i < toutputData.size(); ++i) + { + MNN_PRINT("%f\t, %f\t, %f\n", toutputData[i],outputDataSeparateBias[i], outputPtr[i]); + } + MNN_ERROR("%s(%s) test failed for %d bits, async=%d , relu: %d, relu6: %d!\n", test_op_name.c_str(), device_name.c_str(), nbit, async, activation.first, activation.second); + return false; } - auto time = (float)_t.durationInUs() / 1000.0f; - MNN_PRINT("ConvInt8Weight kernel=(%dx%d) input=(1x%dx%dx%d) output=(1x%dx%dx%d) stride=(%dx%d), avg time = %f\n", - kh, kw, ic, ih, iw, oc, oh, ow, stride, stride, 1.0 * time / LOOP); } return true; } @@ -743,13 +732,24 @@ class ConvolutionTest : public ConvolutionType { virtual ~ConvolutionTest() = default; protected: - static bool test(MNNForwardType type, const std::string& device_name, int precision, MNN::SparseAlgo sparseAlgo, std::vector blocks) { - + static bool test(MNNForwardType type, const std::string& device_name, int precision, MNN::SparseAlgo sparseAlgo, std::vector blocks, bool checkSpectial = false) { + int ocStep = 1; + int icStep = 1; + int isStep = 3; + std::vector ocSize = { + 1, 3, 10, 17 + }; + std::vector icSize = { + 1, 3, 10, 17 + }; + std::vector isSize = { + 1, 7, 9 + }; for (int b = 1; b <= 2; b++) { - for (int oc = 1; oc <= 17; oc += 4) { - for (int ic = 1; ic <= 18; ic += 5) { - for (int is = 1; is <= 17; is += 3) { + for (auto oc : ocSize) { + for (auto ic : icSize) { + for 
(auto is : isSize) { for (int kw = 1; kw <= 3 && kw <= is; kw+=2) { for (int kh = 1; kh <= 3 && kh <= is; kh+=3) { for (int d = 1; d <= 2; d++) { @@ -806,6 +806,9 @@ class ConvolutionTest : public ConvolutionType { } } } + if (!checkSpectial) { + return true; + } // Check Long convolution bool succ = ConvolutionType().test(type, device_name, "Conv2D", 1, 256, 256, 24, 24, PadMode_SAME, 0, 0, 3, 3, 1, 1, 1, precision, sparseAlgo, 4, false); @@ -844,7 +847,7 @@ class ConvolutionTestOnCPU : public DenseConvolutionTest { public: ~ConvolutionTestOnCPU() = default; virtual bool run(int precision) { - return DenseConvolutionTest::test(MNN_FORWARD_CPU, "CPU", precision, MNN::SparseAlgo_RANDOM, {1}); + return DenseConvolutionTest::test(MNN_FORWARD_CPU, "CPU", precision, MNN::SparseAlgo_RANDOM, {1}, true); } }; diff --git a/test/op/ZerosLikeTest.cpp b/test/op/ZerosLikeTest.cpp index fe383735d..4cae5e023 100644 --- a/test/op/ZerosLikeTest.cpp +++ b/test/op/ZerosLikeTest.cpp @@ -31,6 +31,17 @@ class ZerosLikeTest : public MNNTestCase { MNN_ERROR("ZerosLikeTest test failed!\n"); return false; } + output = _ZerosLike(input); + auto o2 = _Stack({output, output}); + auto o2ptr = o2->readMap(); + if (!checkVector(o2ptr, expectedOutput.data(), 16, 0.01)) { + MNN_ERROR("ZerosLikeTest test concat0 failed!\n"); + return false; + } + if (!checkVector(o2ptr + 16, expectedOutput.data(), 16, 0.01)) { + MNN_ERROR("ZerosLikeTest test concat1 failed!\n"); + return false; + } return true; } virtual bool run(int precision) { diff --git a/tools/converter/include/config.hpp b/tools/converter/include/config.hpp index b5ada4f79..befc9d51f 100644 --- a/tools/converter/include/config.hpp +++ b/tools/converter/include/config.hpp @@ -39,6 +39,7 @@ class MNN_PUBLIC modelConfig { bool forTraining = false; int weightQuantBits = 0;// If weightQuantBits > 0, it means the bit bool weightQuantAsymmetric = true; + int weightQuantBlock = -1; // The path of the model compression file that stores the int8 calibration table // or sparse parameters. 
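The converter change that follows wires the new weightQuantBlock option into block-wise weight quantization: when weightQuantBlock > 0, kernelSize is divisible by it, and the kernel is 1x1 (kxky == 1), each output channel is split into kernelSize / weightQuantBlock blocks and a scale (or min/scale pair in the asymmetric case) is stored per block instead of per channel. A minimal sketch of the symmetric per-block scale computation under those assumptions (the helper name is hypothetical):

#include <algorithm>
#include <cmath>
#include <vector>

// Sketch: one symmetric scale per (output channel, block).
// threshold is the integer clamp bound, e.g. 127 for 8-bit or 7 for 4-bit weights.
static std::vector<float> blockwiseScales(const std::vector<float>& weight,
                                          int kernelNum, int kernelSize,
                                          int blockSize, float threshold) {
    const int blockNum = kernelSize / blockSize;
    std::vector<float> scales(kernelNum * blockNum, 0.0f);
    for (int k = 0; k < kernelNum; ++k) {
        for (int b = 0; b < blockNum; ++b) {
            const float* ptr = weight.data() + k * kernelSize + b * blockSize;
            float absMax = 0.0f;
            for (int i = 0; i < blockSize; ++i) {
                absMax = std::max(absMax, std::fabs(ptr[i]));
            }
            scales[k * blockNum + b] = absMax / threshold; // per-block scale
        }
    }
    return scales;
}
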
std::string compressionParamsFile = ""; diff --git a/tools/converter/source/common/WeightQuantAndCoding.cpp b/tools/converter/source/common/WeightQuantAndCoding.cpp index 3fd718dfb..81526ff9f 100644 --- a/tools/converter/source/common/WeightQuantAndCoding.cpp +++ b/tools/converter/source/common/WeightQuantAndCoding.cpp @@ -82,6 +82,8 @@ void WeightQuantAndCoding(std::unique_ptr& op, const modelConfig& conf } int kernelNum = common->outputCount; int kernelSize = weightSize / kernelNum; + int kxky = common->kernelX * common->kernelY; + int icCount = kernelSize / kxky; bool asymmetricQuantFlag = config.weightQuantAsymmetric; @@ -91,7 +93,12 @@ void WeightQuantAndCoding(std::unique_ptr& op, const modelConfig& conf clampMin = -threshold - 1; } std::vector weightData, scales; - std::vector quantWeights; + // block-wise quant + int block_size = kernelSize, block_num = 1; + if (config.weightQuantBlock > 0 && (kernelSize % config.weightQuantBlock == 0) && kxky == 1) { + block_size = config.weightQuantBlock; + block_num = kernelSize / block_size; + } switch (opType) { case MNN::OpType_Convolution: @@ -99,41 +106,32 @@ void WeightQuantAndCoding(std::unique_ptr& op, const modelConfig& conf case MNN::OpType_Deconvolution: case MNN::OpType_DeconvolutionDepthwise: { weightData = std::move(param->weight); - if (asymmetricQuantFlag) { - scales.resize(kernelNum*2); + scales.resize(kernelNum * block_num * 2); for (int k = 0; k < kernelNum; k++) { - int beginIndex = k * kernelSize; - auto minAndMax = findMinMax(weightData.data() + beginIndex, kernelSize); - float min = minAndMax[0]; - float max = minAndMax[1]; - float scale = (max - min) / (threshold - clampMin); - - scales[2*k] = min; - scales[2*k+1] = scale; - - for (int ii = 0; ii < kernelSize; ii++) { - float* ptr = weightData.data() + beginIndex; - int8_t quantValue = int8_t(std::round((ptr[ii] - min) / scale + clampMin)); - quantWeights.emplace_back(quantValue); + for (int b = 0; b < block_num; b++) { + int beginIndex = k * kernelSize + b * block_size; + auto minAndMax = findMinMax(weightData.data() + beginIndex, block_size); + float min = minAndMax[0]; + float max = minAndMax[1]; + float scale = (max - min) / (threshold - clampMin); + + int scaleIndex = k * block_num + b; + scales[2 * scaleIndex] = min; + scales[2 * scaleIndex + 1] = scale; } } } else { - scales.resize(kernelNum); + scales.resize(kernelNum * block_num); for (int k = 0; k < kernelNum; k++) { - int beginIndex = k * kernelSize; - auto absMax = findAbsMax(weightData.data() + beginIndex, kernelSize); - - scales[k] = absMax / threshold; - - for (int ii = 0; ii < kernelSize; ii++) { - float* ptr = weightData.data() + beginIndex; - int8_t quantValue = int8_t(std::round(ptr[ii] / scales[k])); - quantWeights.emplace_back(quantValue); + for (int b = 0; b < block_num; b++) { + int beginIndex = k * kernelSize + b * block_size; + auto absMax = findAbsMax(weightData.data() + beginIndex, block_size); + int scaleIndex = k * block_num + b; + scales[scaleIndex] = absMax / threshold; } } } - break; } case MNN::OpType_ConvInt8: @@ -150,12 +148,14 @@ void WeightQuantAndCoding(std::unique_ptr& op, const modelConfig& conf break; } + kernelSize = block_size; + kernelNum = kernelNum * block_num; if (opType == MNN::OpType_ConvInt8 || opType == MNN::OpType_DepthwiseConvInt8) { param->quanParameter = IDSTEncoder::encode(weightData.data(), scales, kernelSize, kernelNum, false, param->symmetricQuan->weight.data(), int(clampMin), bits); param->symmetricQuan->weight.clear(); param->quanParameter->alpha = {1.0f}; // 
fake scales } else { - param->quanParameter = IDSTEncoder::encode(weightData.data(), scales, kernelSize, kernelNum, asymmetricQuantFlag, quantWeights.data(), int(clampMin), bits, config.detectSparseSpeedUp); + param->quanParameter = IDSTEncoder::encode(weightData.data(), scales, kernelSize, kernelNum, asymmetricQuantFlag, nullptr, int(clampMin), bits, config.detectSparseSpeedUp); param->weight.clear(); std::vector empty; param->weight.swap(empty); diff --git a/tools/converter/source/common/cli.cpp b/tools/converter/source/common/cli.cpp index c2a64a8ee..ffe8c8ae9 100644 --- a/tools/converter/source/common/cli.cpp +++ b/tools/converter/source/common/cli.cpp @@ -205,6 +205,11 @@ bool Cli::initializeMNNConvertArgs(modelConfig &modelPath, int argc, char **argv "but asymmetric quant model cannot run on old MNN versions. You will need to upgrade MNN to new version to solve this problem. default: false", cxxopts::value() ) + ( + "weightQuantBlock", + "using block-wise weight quant, set block size, defaut: -1, which means channel-wise weight quant", + cxxopts::value() + ) ( "compressionParamsFile", "The path of the compression parameters that stores activation, " @@ -437,7 +442,10 @@ bool Cli::initializeMNNConvertArgs(modelConfig &modelPath, int argc, char **argv modelPath.weightQuantBits = result["weightQuantBits"].as(); } if (result.count("weightQuantAsymmetric")) { - modelPath.weightQuantAsymmetric = true; + modelPath.weightQuantAsymmetric = result["weightQuantAsymmetric"].as(); + } + if (result.count("weightQuantBlock")) { + modelPath.weightQuantBlock = result["weightQuantBlock"].as(); } if (result.count("saveStaticModel")) { modelPath.saveStaticModel = true; diff --git a/tools/converter/source/common/convertToStaticModel.cpp b/tools/converter/source/common/convertToStaticModel.cpp index bfce390cf..2d8c120db 100644 --- a/tools/converter/source/common/convertToStaticModel.cpp +++ b/tools/converter/source/common/convertToStaticModel.cpp @@ -332,7 +332,7 @@ void converToStaticModel(const Net* net, std::map>& } std::vector infos; initPipelineInfosFromNet(infos, net, allTensors); - GeometryComputer::Context ctx(defaultBackend); + GeometryComputer::Context ctx(Interpreter::GeometryComputeMask::GEOMETRCOMPUTEMASK_ALL, defaultBackend); // resize the session's info and store to buffer std::vector constTensors; GeometryComputerUtils::buildConstantTensors(infos); diff --git a/tools/converter/source/optimizer/merge/FuseAttention.cpp b/tools/converter/source/optimizer/merge/FuseAttention.cpp index c0b69061c..8c2058855 100644 --- a/tools/converter/source/optimizer/merge/FuseAttention.cpp +++ b/tools/converter/source/optimizer/merge/FuseAttention.cpp @@ -22,6 +22,22 @@ class FuseAttention { VARP query, key, value, mask; }; +static EXPRP is_gqa(EXPRP& x) { + if (!helpers::IsReshape(x)) { + return x; + } + auto y = x->inputs().at(0)->expr().first; + if (!helpers::IsBroadcastTo(y)) { + return x; + } + y = y->inputs().at(0)->expr().first; + if (!helpers::IsUnsqueeze(y)) { + return x; + } + y = y->inputs().at(0)->expr().first; + return y; +} + FuseAttention::FuseAttention() { auto match = [this](EXPRP expr) -> bool { auto config = Global::Get(); @@ -44,9 +60,9 @@ FuseAttention::FuseAttention() { if (!helpers::IsMatMul(matmul)) { return false; } - - // transpose y = matmul->inputs().at(1)->expr().first; + y = is_gqa(y); + // transpose if (!helpers::IsTranspose(y)) { return false; } @@ -98,8 +114,9 @@ FuseAttention::FuseAttention() { // query query = z->inputs().at(0); - // transpose y = 
x->inputs().at(1)->expr().first; + // transpose + y = is_gqa(y); if (!helpers::IsTranspose(y)) { return false; } @@ -120,6 +137,9 @@ FuseAttention::FuseAttention() { // For target version < 2.8 , don't support fmha_v2 return false; } + if (expr->name().size() > 0) { + MNN_PRINT("Fuse Attention as %s\n", expr->name().c_str()); + } std::unique_ptr attention(new OpT); attention->name = "Attention" + expr->name(); @@ -206,6 +226,9 @@ RemovePastKeyValue::RemovePastKeyValue() { // For target version < 2.8 , don't support fmha_v2 return false; } + if (!expr->name().empty()) { + MNN_PRINT("Remove past KV for %s\n", expr->name().c_str()); + } // past-kv remove std::unique_ptr reshape(new OpT); diff --git a/tools/converter/source/optimizer/merge/MergeHelpers.cpp b/tools/converter/source/optimizer/merge/MergeHelpers.cpp index b49201c0e..4f4401226 100644 --- a/tools/converter/source/optimizer/merge/MergeHelpers.cpp +++ b/tools/converter/source/optimizer/merge/MergeHelpers.cpp @@ -167,6 +167,11 @@ bool IsExpandDims(EXPRP expr) { return op && op->type() == OpType_ExpandDims; } +bool IsBroadcastTo(EXPRP expr) { + const Op* op = expr->get(); + return op && op->type() == OpType_BroadcastTo; +} + EXPRP InputExpr(EXPRP expr, int input_index) { return expr->inputs().at(input_index)->expr().first; } diff --git a/tools/converter/source/optimizer/merge/MergeHelpers.hpp b/tools/converter/source/optimizer/merge/MergeHelpers.hpp index bfc720c97..a4515ab0e 100644 --- a/tools/converter/source/optimizer/merge/MergeHelpers.hpp +++ b/tools/converter/source/optimizer/merge/MergeHelpers.hpp @@ -48,6 +48,7 @@ bool IsReductionMean(Express::EXPRP expr); bool IsConvolution(Express::EXPRP expr); bool IsExpandDims(Express::EXPRP expr); +bool IsBroadcastTo(Express::EXPRP expr); Express::EXPRP InputExpr(Express::EXPRP expr, int input_index); Express::EXPRP OutputExpr(Express::EXPRP expr, int output_index); diff --git a/tools/converter/source/optimizer/postconvert/RemoveInvalidCast.cpp b/tools/converter/source/optimizer/postconvert/RemoveInvalidCast.cpp index db3d0e86c..4f4001b00 100644 --- a/tools/converter/source/optimizer/postconvert/RemoveInvalidCast.cpp +++ b/tools/converter/source/optimizer/postconvert/RemoveInvalidCast.cpp @@ -12,6 +12,7 @@ #include #include #include "../PostTreatUtils.hpp" +#include using namespace MNN; class RemoveInvalidCast : public PostConverter { public: @@ -67,6 +68,20 @@ class RemoveInvalidCast : public PostConverter { case MNN::OpType_Cast: types[op->outputIndexes[0]] = op->main.AsCastParam()->dstT; break; + // Float Op + case MNN::OpType_PReLU: + case MNN::OpType_Softmax: + case MNN::OpType_Convolution: + case MNN::OpType_ConvolutionDepthwise: + case MNN::OpType_Convolution3D: + case MNN::OpType_Deconvolution: + case MNN::OpType_DeconvolutionDepthwise: + case MNN::OpType_MatMul: + if (op->outputIndexes.size() == 1) { + // 4 is integer matmul + types[op->outputIndexes[0]] = MNN::DataType_DT_FLOAT; + } + break; case MNN::OpType_Const: case MNN::OpType_TrainableParam: types[op->outputIndexes[0]] = op->main.AsBlob()->dataType; @@ -74,6 +89,13 @@ class RemoveInvalidCast : public PostConverter { case MNN::OpType_Fill: types[op->outputIndexes[0]] = types[op->inputIndexes[1]]; break; + case MNN::OpType_Slice: + case MNN::OpType_SliceTf: + case MNN::OpType_Unpack: + for (auto v : op->outputIndexes) { + types[v] = types[op->inputIndexes[0]]; + } + break; case MNN::OpType_Shape: case MNN::OpType_Size: case MNN::OpType_Rank: @@ -111,12 +133,33 @@ class RemoveInvalidCast : public PostConverter { } } break; + 
// Deform + case MNN::OpType_Broastcast: + case MNN::OpType_Concat: + case MNN::OpType_Crop: + case MNN::OpType_CropAndResize: + case MNN::OpType_Col2Im: + case MNN::OpType_DepthToSpace: + case MNN::OpType_ExpandDims: + case MNN::OpType_Flatten: + case MNN::OpType_Interp: + case MNN::OpType_Interp3D: + case MNN::OpType_Im2Col: + case MNN::OpType_Pack: + case MNN::OpType_Padding: + case MNN::OpType_Permute: + case MNN::OpType_Reshape: + case MNN::OpType_Resize: + case MNN::OpType_StridedSlice: + case MNN::OpType_SpaceToDepth: + case MNN::OpType_Squeeze: + case MNN::OpType_Transpose: + case MNN::OpType_Unsqueeze: + { + types[op->outputIndexes[0]] = types[op->inputIndexes[0]]; + } + break; default: - if (op->inputIndexes.size() > 0) { - for (int i=0; ioutputIndexes.size(); ++i) { - types[op->outputIndexes[i]] = types[op->inputIndexes[0]]; - } - } break; } } @@ -134,7 +177,7 @@ class RemoveInvalidCast : public PostConverter { } if (types[op->inputIndexes[0]] != types[op->outputIndexes[0]]) { iter++; - break; + continue; } if (std::find(net->outputName.begin(), net->outputName.end(), net->tensorName[op->outputIndexes[0]]) != net->outputName.end()) { iter++; diff --git a/tools/cpp/ExprDebug.hpp b/tools/cpp/ExprDebug.hpp index 0665f193c..ce342a0dd 100644 --- a/tools/cpp/ExprDebug.hpp +++ b/tools/cpp/ExprDebug.hpp @@ -1,6 +1,8 @@ #include #include #include +#include +#include #define DUMP_NUM_DATA(type) \ auto data = tensor->host(); \ for (int z = 0; z < outside; ++z) { \ @@ -125,7 +127,7 @@ static void _initDebug() { } return true; }; - MNN::Express::Executor::getGlobalExecutor()->setCallBack(std::move(beforeCallBack), std::move(callBack)); + MNN::Express::ExecutorScope::Current()->setCallBack(std::move(beforeCallBack), std::move(callBack)); } @@ -170,7 +172,7 @@ static void _initTimeTrace() { gTimeTraceInfo->end(info); return true; }; - MNN::Express::Executor::getGlobalExecutor()->setCallBack(std::move(beforeCallBack), std::move(callBack)); + MNN::Express::ExecutorScope::Current()->setCallBack(std::move(beforeCallBack), std::move(callBack)); } template @@ -274,5 +276,5 @@ static void _initTensorStatic() { } return true; }; - MNN::Express::Executor::getGlobalExecutor()->setCallBack(std::move(beforeCallBack), std::move(callBack)); + MNN::Express::ExecutorScope::Current()->setCallBack(std::move(beforeCallBack), std::move(callBack)); } diff --git a/tools/cpp/ModuleBasic.cpp b/tools/cpp/ModuleBasic.cpp index aacb1d4d6..c9b4e93ad 100644 --- a/tools/cpp/ModuleBasic.cpp +++ b/tools/cpp/ModuleBasic.cpp @@ -232,6 +232,13 @@ int main(int argc, char *argv[]) { // Need tensor static for each op, open debug rtmgr->setMode(Interpreter::Session_Debug); } + // For Debug + if (false) { + int geometryMask = Interpreter::GeometryComputeMask::GEOMETRCOMPUTEMASK_ALL; + geometryMask -= Interpreter::GeometryComputeMask::GEOMETRCOMPUTEMASK_FUSEREGION; + geometryMask -= Interpreter::GeometryComputeMask::GEOMETRCOMPUTEMASK_OPENCACHE; + rtmgr->setHint(Interpreter::GEOMETRY_COMPUTE_MASK, geometryMask); + } if (runMask & 4) { // Need time trace for each op, open debug rtmgr->setMode(Interpreter::Session_Debug); diff --git a/tools/script/apply_gptq.py b/tools/script/apply_gptq.py new file mode 100644 index 000000000..06e81c965 --- /dev/null +++ b/tools/script/apply_gptq.py @@ -0,0 +1,187 @@ +import json +import torch +import argparse + +class MNNWeight: + def __init__(self, name, external, a_min): + self.name = name + self.external = external + self.a_min = a_min + self.parse_name() + + def __repr__(self) -> str: + return 
f'{self.layer_id}.{self.op_id}.{self.block_id}, {self.external}' + + def parse_name(self): + parts = self.name.split('/') + if len(parts) > 4: + self.layer_id = parts[1].split('.')[1] + self.op_id = parts[2] + '.' + parts[3] + self.block_id = parts[-1].split('__')[-1] + else: + self.layer_id = -1 + self.op_id = parts[2] + self.block_id = parts[-1].split('__')[-1] + + def key(self): return f'{self.layer_id}.{self.op_id}' + def idx(self): return int(self.block_id) + def offset(self): return self.external[0] + def weight_size(self): return self.external[1] + def scale_size(self): return self.external[2] + +def weight_reorder(qweight, bits=4, group_size=128): + oc = qweight.shape[-1] + wf = torch.tensor(list(range(0, 32, bits)), dtype=torch.int32).unsqueeze(0) + weight = torch.bitwise_right_shift(torch.unsqueeze(qweight, 1).expand(-1, 32 // bits, -1), wf.unsqueeze(-1)).to(torch.int16 if bits == 8 else torch.int8) + torch.bitwise_and(weight, (2 ** bits) - 1, out=weight) + weight = weight.reshape(-1, oc).transpose(1, 0) + weight = weight.reshape(-1, 2).to(torch.uint8) + weight = weight[:, 0] * 16 + weight[:, 1] + return weight + +class MNNModel: + def __init__(self, model, weight): + self.mnn_graph = json.load(open(model, 'rt')) + self.external_weight = weight + self.parse_conv() + + def parse_conv(self): + self.weights = [] + for op in self.mnn_graph['oplists']: + if op['type'] == 'Convolution': + name = op['name'] + external = op['main']['external'] + a_min = op['main']['quanParameter']['aMin'] + self.weights.append(MNNWeight(name, external, a_min)) + + def apply_weight_split(self, gptq_tensor): + bin_file = open(self.external_weight, 'r+b') + for mnn_weight in self.weights: + idx = mnn_weight.idx() + gptq_weight = gptq_tensor.get(mnn_weight.key()) + if gptq_weight is None: continue + print(f'write {mnn_weight.key()}.{idx} ... ', end='') + weight = gptq_weight.weight(idx) + scale = gptq_weight.scale(idx).float() + # write weight data + weight = weight_reorder(weight) + weight_bytes = weight.numpy().tobytes() + weight_size = mnn_weight.weight_size() + header_len = weight_size - len(weight_bytes) + assert(header_len > 0) + bin_file.seek(mnn_weight.offset() + header_len) + bin_file.write(weight_bytes) + scale_size = mnn_weight.scale_size() + is_asy = scale.numel() * scale.element_size() < scale_size + # write scale data + if is_asy: + zeros = mnn_weight.a_min * scale + scale = torch.stack([zeros, scale], axis=-1) + scale_bytes = scale.numpy().tobytes() + assert(scale_size == len(scale_bytes)) + bin_file.write(scale_bytes) + print('Done!') + # break + bin_file.close() + + def apply_weight(self, gptq_tensor): + bin_file = open(self.external_weight, 'r+b') + for mnn_weight in self.weights: + gptq_weight = gptq_tensor.get(mnn_weight.key()) + if gptq_weight is None: continue + print(f'write {mnn_weight.key()} ... 
', end='') + weight = gptq_weight.qweight + scale = gptq_weight.scales.float().transpose(1, 0) + # write weight data + weight = weight_reorder(weight) + weight_bytes = weight.numpy().tobytes() + weight_size = mnn_weight.weight_size() + header_len = weight_size - len(weight_bytes) + assert(header_len > 0) + bin_file.seek(mnn_weight.offset() + header_len) + bin_file.write(weight_bytes) + scale_size = mnn_weight.scale_size() + is_asy = scale.numel() * scale.element_size() < scale_size + # write scale data + if is_asy: + zeros = mnn_weight.a_min * scale + scale = torch.stack([zeros, scale], axis=-1) + scale_bytes = scale.numpy().tobytes() + assert(scale_size == len(scale_bytes)) + bin_file.write(scale_bytes) + print('Done!') + bin_file.close() + + def apply(self, gptq_tensor): + if self.weights[0].block_id.isdigit(): + self.apply_weight_split(gptq_tensor) + else: + self.apply_weight(gptq_tensor) + +class GPTQWeight: + def __init__(self, name): + self.name = name + + def __repr__(self) -> str: + if hasattr(self, 'qweight'): + return f'{self.name}, {self.qweight.shape}, {self.scales.shape}' + return 'None' + + def add(self, name, tensor): + setattr(self, name, tensor) + + def weight(self, idx): + shape = self.qweight.shape + if len(shape) == 2: + ic, oc = shape + self.qweight = self.qweight.reshape(ic//16, 16, oc) + return self.qweight[idx] + + def scale(self, idx): + return self.scales[idx] + +class GPTQTensor: + def __init__(self, file): + self.file = file + self.load() + + def prefix(self, name): + splits = name.split('.') + if len(splits) < 5: + return None, None + pre = f'{splits[2]}.{splits[3]}.{splits[4]}' + suf = splits[-1] + return pre, suf + + def __repr__(self) -> str: + return self.weight_dict.__repr__() + + def get(self, key : str): + if key in self.weight_dict: + return self.weight_dict[key] + return None + + def load(self): + self.weight_dict = dict() + from safetensors import safe_open + with safe_open(self.file, framework="pt") as f: + for k in f.keys(): + p, s = self.prefix(k) + if p is None: continue + if s not in ['qweight', 'scales']: continue + if p not in self.weight_dict: + self.weight_dict[p] = GPTQWeight(p) + self.weight_dict[p].add(s, f.get_tensor(k)) + +def main(args): + mnn_model = MNNModel(args.mnn_graph, args.mnn_weight) + gptq_weight = GPTQTensor(args.gptq_tensor) + mnn_model.apply(gptq_weight) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='apply_gptq', formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--mnn_graph', type=str, required=True, help='mnn graph json path.') + parser.add_argument('--mnn_weight', type=str, required=True, help='mnn weight file path.') + parser.add_argument('--gptq_tensor', type=str, required=True, help='gptq tensor path.') + args = parser.parse_args() + main(args) diff --git a/transformers/llm/config.json b/transformers/llm/config.json new file mode 100755 index 000000000..7025fad4b --- /dev/null +++ b/transformers/llm/config.json @@ -0,0 +1,9 @@ +{ + "llm_model": "llm.mnn", + "llm_weight": "llm.mnn.weight", + + "backend_type": "cpu", + "thread_num": 4, + "precision": "low", + "memory": "low" +} diff --git a/transformers/llm/engine/include/llm.hpp b/transformers/llm/engine/include/llm.hpp index 306cd66f9..4754010cf 100644 --- a/transformers/llm/engine/include/llm.hpp +++ b/transformers/llm/engine/include/llm.hpp @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include #include #include @@ -21,9 +23,11 @@ #include #include #include "tokenizer.hpp" +#include 
"rapidjson/document.h" using namespace MNN; using namespace Express; +using namespace rapidjson; class Tokenizer; class Pipeline; @@ -46,358 +50,293 @@ class LlmStreamBuffer : public std::streambuf { CallBack callback_ = nullptr; }; -class MNN_PUBLIC Llm { -public: - Llm() { - // default tokenier is senrencepiece - tokenizer_.reset(new Sentencepiece); +static inline bool has_suffix(const std::string& str, const std::string& suffix) { + return str.size() >= suffix.size() && + str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0; +} + +static inline std::string base_dir(const std::string& path) { + size_t pos = path.find_last_of("/\\"); + if (pos == std::string::npos) { + return "./"; + } else { + return path.substr(0, pos + 1); } - virtual ~Llm() { - decode_modules_.clear(); - prefill_modules_.clear(); - modules_.clear(); - visual_module_.reset(); - runtime_manager_.reset(); +} + +static inline std::string file_name(const std::string& path) { + size_t pos = path.find_last_of("/\\"); + if (pos == std::string::npos) { + return path; + } else { + return path.substr(pos + 1); } - // Default memory is low, precision is low - static Llm* createLLM(const std::string& path, std::string model_type = "auto", int forwardType = 0, int memoryprecison = 10); - void load(const std::string& model_dir); - void chat(); - void trace(bool start); - void warmup(); - std::string response(const std::string& input_str, std::ostream* os = &std::cout, const char* end_with = nullptr); - std::string response_nohistory(const std::string& input_str, std::ostream* os = &std::cout, const char* end_with = nullptr); - float load_progress() { return load_progress_; } - void reset(); - void print_speed(); - friend class Pipeline; -public: - std::vector history_; - // forward info - int max_seq_len_ = 1024; - int prompt_len_ = 0; - int gen_seq_len_ = 0; - int all_seq_len_ = 0; - // time - int64_t prefill_us_ = 0; - int64_t decode_us_ = 0; -protected: - void response_init(); - std::string response_impl(const std::vector& input_ids, std::ostream* os, const char* end_with); - VARP embedding(const std::vector& input_ids); - VARP txt_embedding(const std::vector& input_ids); - int forward(const std::vector& input_ids); - std::vector tokenizer_encode(const std::string& input_str); - std::string decode(int id); -protected: - VARP inputs_embeds_, attention_mask_, position_ids_; - // model configs - bool is_single_ = false; - bool is_disk_embedding_ = false; - bool is_visual_ = false; - int layer_nums_ = 0; - int hidden_size_ = 4096; - std::vector key_value_shape_ = {}; - std::string model_name_ = ""; - std::string disk_embedding_file_ = ""; - // gen info - float load_progress_ = 0.f; - // tokenizer - std::unique_ptr tokenizer_; - std::shared_ptr visual_module_; -private: - virtual VARP visual_embedding(const std::vector& input_ids) { return nullptr; } - virtual std::vector tokenizer(const std::string& query) = 0; - virtual VARP gen_attention_mask(int seq_len) = 0; - virtual VARP gen_position_ids(int seq_len) = 0; - virtual bool is_stop(int token_id) = 0; -private: - // MNN Modules - std::shared_ptr runtime_manager_; - std::vector> modules_; - std::vector> decode_modules_; - std::vector> prefill_modules_; - std::vector past_key_values_; - // model dir - std::string model_dir_; - int mForwardType = 0; - int mPrecisionMemory = 0; -}; +} -// some llm models -class Chatglm_6b : public Llm { +class rapid_json_wrapper { public: - Chatglm_6b() { - model_name_ = "Chatglm_6b"; - layer_nums_ = 28; - key_value_shape_ = {2, 0, 1, 32, 
128}; + Document document; + rapid_json_wrapper() {} + rapid_json_wrapper(Document doc) : document(std::move(doc)) {} + static rapid_json_wrapper parse(const std::ifstream& ifile) { + std::ostringstream ostr; + ostr << ifile.rdbuf(); + Document document; + document.Parse(ostr.str().c_str()); + rapid_json_wrapper json_wrapper(std::move(document)); + return json_wrapper; + } + static rapid_json_wrapper parse(const char* str) { + Document document; + document.Parse(str); + rapid_json_wrapper json_wrapper(std::move(document)); + return json_wrapper; } -private: - virtual std::vector tokenizer(const std::string& query) override; - virtual VARP gen_attention_mask(int seq_len) override; - virtual VARP gen_position_ids(int seq_len) override; - virtual bool is_stop(int token_id) override; - int context_len_ = 0; -}; -class Chatglm2_6b : public Llm { -public: - Chatglm2_6b() { - model_name_ = "Chatglm2_6b"; - layer_nums_ = 28; - key_value_shape_ = {2, 0, 1, 2, 128}; + template + T value(const char* key, const T& defualt_value) const { + if (document.HasMember(key)) { + const auto& value = document[key]; + if constexpr (std::is_same::value) { + if (value.IsInt()) return value.GetInt(); + } else if constexpr (std::is_same::value || std::is_same::value) { + if (value.IsString()) return value.GetString(); + } else if constexpr (std::is_same::value) { + if (value.IsBool()) return value.GetBool(); + } else if constexpr (std::is_same>::value) { + if (value.IsArray()) { + std::vector result; + for (auto& v : value.GetArray()) { + if (v.IsInt()) { + result.push_back(v.GetInt()); + } + } + return result; + } + } + } + return defualt_value; + } + std::string value(const char key[], const char defualt_value[]) const { + return value(key, std::string(defualt_value)); } -private: - virtual std::vector tokenizer(const std::string& query) override; - virtual VARP gen_attention_mask(int seq_len) override; - virtual VARP gen_position_ids(int seq_len) override; - virtual bool is_stop(int token_id) override; }; -class Phi_2 : public Chatglm2_6b { +class LlmConfig { public: - Phi_2() { - model_name_ = "Phi_2"; - layer_nums_ = 32; - key_value_shape_ = {1, 0, 2, 32, 80}; - hidden_size_ = 2560; - tokenizer_.reset(new Tiktoken); + std::string base_dir_; + rapid_json_wrapper config_, llm_config_; + LlmConfig() {} + LlmConfig(const std::string& path) { + // load config + if (has_suffix(path, ".json")) { + std::ifstream config_file(path); + if (config_file.is_open()) { + config_ = rapid_json_wrapper::parse(config_file); + } else { + std::cerr << "Unable to open config file: " << path << std::endl; + } + base_dir_ = base_dir(path); + } else { + // compatibility with the original usage + if (has_suffix(path, ".mnn")) { + auto model_name = file_name(path); + std::string json_str = R"({ + "llm_model": ")" + model_name + R"(", + "llm_weight": ")" + model_name + R"(.weight" + })"; + config_ = rapid_json_wrapper::parse(json_str.c_str()); + base_dir_ = base_dir(path); + } else { + const char* json_cstr = "{}"; + config_ = rapid_json_wrapper::parse(json_cstr); + base_dir_ = path; + } + } + // using config's base_dir + base_dir_ = config_.value("base_dir", base_dir_); + // load llm_config for model info + std::ifstream llm_config_file(llm_config()); + if (llm_config_file.is_open()) { + llm_config_ = rapid_json_wrapper::parse(llm_config_file); + } else { + std::cerr << "Unable to open llm_config file: " << llm_config() << std::endl; + } } -private: - virtual std::vector tokenizer(const std::string& query) override; - virtual bool 
is_stop(int token_id) override; -}; -class Qwen_7b : public Llm { -public: - Qwen_7b() { - model_name_ = "Qwen_7b"; - layer_nums_ = 32; - key_value_shape_ = {2, 1, 0, 32, 128}; - hidden_size_ = 4096; - tokenizer_.reset(new Tiktoken); + // < model file config start + std::string llm_config() const { + return base_dir_ + config_.value("llm_config", "llm_config.json"); } -private: - virtual std::vector tokenizer(const std::string& query) override; - virtual VARP gen_attention_mask(int seq_len) override; - virtual VARP gen_position_ids(int seq_len) override; - virtual bool is_stop(int token_id) override; -}; -class Qwen_vl : public Qwen_7b { -public: - Qwen_vl() { - model_name_ = "Qwen_vl"; - is_visual_ = true; - layer_nums_ = 32; - key_value_shape_ = {2, 1, 0, 32, 128}; - hidden_size_ = 4096; - tokenizer_.reset(new Tiktoken); + std::string llm_model() const { + return base_dir_ + config_.value("llm_model", "llm.mnn"); } -private: - const int img_size_ = 448; - const int imgpad_len_ = 256; - const int img_start_ = 151857; - const int img_end_ = 151858; - const int img_pad_ = 151859; -private: - std::vector url_encode(const std::string& url); - virtual VARP visual_embedding(const std::vector& input_ids) override; - virtual std::vector tokenizer(const std::string& query) override; - virtual VARP gen_attention_mask(int seq_len) override; -}; -class Qwen_1_8b : public Qwen_7b { -public: - Qwen_1_8b() { - model_name_ = "Qwen_1.8b"; - layer_nums_ = 24; - key_value_shape_ = {2, 1, 0, 16, 128}; - hidden_size_ = 2048; - tokenizer_.reset(new Tiktoken); + std::string llm_weight() const { + return base_dir_ + config_.value("llm_weight", "llm.mnn.weight"); } -}; -class Llama2_7b : public Llm { -public: - Llama2_7b() { - model_name_ = "Llama2_7b"; - layer_nums_ = 32; - key_value_shape_ = {2, 1, 32, 0, 128}; + std::string block_model(int index) const { + return base_dir_ + config_.value("block_model", "block_") + std::to_string(index) + ".mnn"; } -private: - virtual std::vector tokenizer(const std::string& query) override; - virtual VARP gen_attention_mask(int seq_len) override; - virtual VARP gen_position_ids(int seq_len) override; - virtual bool is_stop(int token_id) override; -}; -class MiniCPM_1_2b : public Llama2_7b { -public: - MiniCPM_1_2b() { - model_name_ = "MiniCPM_1_2b"; - layer_nums_ = 52; - key_value_shape_ = {2, 1, 8, 0, 64}; - hidden_size_ = 1536; + std::string lm_model() const { + return base_dir_ + config_.value("lm_model", "lm.mnn"); } -private: - virtual std::vector tokenizer(const std::string& query) override; -}; -class MiniCPM_2_4b : public Llama2_7b { -public: - MiniCPM_2_4b() { - model_name_ = "MiniCPM_1_2b"; - layer_nums_ = 40; - key_value_shape_ = {2, 1, 36, 0, 64}; - hidden_size_ = 2304; + std::string embedding_model() const { + return base_dir_ + config_.value("embedding_model", "embedding.mnn"); } -private: - virtual std::vector tokenizer(const std::string& query) override; -}; -class Llama3_8b : public Llama2_7b { -public: - Llama3_8b() { - model_name_ = "Llama3_8b"; - layer_nums_ = 32; - key_value_shape_ = {2, 1, 8, 0, 128}; - hidden_size_ = 4096; + std::string embedding_file() const { + return base_dir_ + config_.value("embedding_file", "embeddings_bf16.bin"); } -private: - virtual std::vector tokenizer(const std::string& query) override; - virtual bool is_stop(int token_id) override; -}; -class Qwen2 : public Llama2_7b { -public: - Qwen2() { - model_name_ = "Qwen2"; - tokenizer_.reset(new HuggingfaceTokenizer); + + std::string tokenizer_file() const { + return base_dir_ + 
config_.value("tokenizer_file", "tokenizer.txt"); } -private: - virtual std::vector tokenizer(const std::string& query) override; - virtual bool is_stop(int token_id) override; -}; -class Qwen2_0_5b : public Qwen2 { -public: - Qwen2_0_5b() { - model_name_ = "Qwen2_0.5b"; - layer_nums_ = 24; - key_value_shape_ = {2, 1, 0, 16, 64}; - hidden_size_ = 1024; + std::string visual_model() const { + return base_dir_ + config_.value("visual_model", "visual.mnn"); } -}; + // model file config end > -class Qwen2_1_8b : public Qwen2 { -public: - Qwen2_1_8b() { - model_name_ = "Qwen2_1.8b"; - layer_nums_ = 24; - key_value_shape_ = {2, 1, 16, 0, 128}; - hidden_size_ = 2048; + // < generate config start + int max_new_tokens() const { + return config_.value("max_new_tokens", 512); } -}; + // generate config end > -class Qwen2_4b : public Qwen2 { -public: - Qwen2_4b() { - model_name_ = "Qwen2_4b"; - layer_nums_ = 40; - key_value_shape_ = {2, 1, 20, 0, 128}; - hidden_size_ = 2560; + // < backend config start + std::string backend_type() const { + return config_.value("backend_type", "cpu"); } -}; -class Qwen2_7b : public Qwen2 { -public: - Qwen2_7b() { - model_name_ = "Qwen2_7b"; - layer_nums_ = 32; - key_value_shape_ = {2, 1, 32, 0, 128}; - hidden_size_ = 4096; + int thread_num() const { + return config_.value("thread_num", 4); } -}; -class TinyLlama : public Llama2_7b { -public: - TinyLlama() { - model_name_ = "TinyLlama"; - layer_nums_ = 22; - key_value_shape_ = {2, 1, 4, 0, 64}; + std::string precision() const { + return config_.value("precision", "low"); } -private: - virtual std::vector tokenizer(const std::string& query) override; -}; -class Yi_6b : public Llama2_7b { -public: - Yi_6b() { - model_name_ = "Yi_6b"; - key_value_shape_ = {2, 1, 4, 0, 128}; + std::string memory() const { + return config_.value("memory", "low"); } -private: - virtual std::vector tokenizer(const std::string& query) override; - virtual bool is_stop(int token_id) override; -}; -// Llm end + // backend config end > -// Embedding start -class Embedding { -public: - Embedding() { - // default tokenier is Bert - tokenizer_.reset(new BertTokenizer); + // < llm model config start + bool is_single() const { + return llm_config_.value("is_single", true); } - virtual ~Embedding() { - module_.reset(); - runtime_manager_.reset(); + + bool is_visual() const { + return llm_config_.value("is_visual", false); } - static Embedding* createEmbedding(const std::string& path, std::string model_type = "auto"); - static float dist(VARP var0, VARP var1); - void load(const std::string& model_dir); - VARP embedding(const std::string& txt); + + int hidden_size() const { + return llm_config_.value("hidden_size", 4096); + } + + int layer_nums() const { + return llm_config_.value("layer_nums", 32); + } + + std::vector key_value_shape() const { + return llm_config_.value("key_value_shape", std::vector{}); + } + + std::string attention_mask() const { + return llm_config_.value("attention_mask", "int"); + } + + std::string prompt_template() const { + return llm_config_.value("prompt_template", ""); + } + // llm model config end > +}; + +class MNN_PUBLIC Llm { +public: + Llm(std::shared_ptr config) : config_(config) {} + virtual ~Llm(); + static Llm* createLLM(const std::string& config_path); + void chat(); + void trace(bool start); + virtual void load(); + VARP forward(const std::vector& input_ids); + int sample(VARP logits, const std::vector& pre_ids); + std::string apply_chat_template(const std::string& input_str) const; + std::string response(const 
std::string& input_str, std::ostream* os = &std::cout, const char* end_with = nullptr); + void generate_init(); + std::string generate(const std::vector& input_ids, std::ostream* os, const char* end_with); + std::vector generate(const std::vector& input_ids, int max_new_tokens = -1); void print_speed(); - int dim() { return hidden_size_; } + friend class Pipeline; public: - // time - int64_t embedding_us_ = 0; + // forward info int prompt_len_ = 0; -protected: - std::vector tokenizer_encode(const std::string& input_str); -protected: - // model configs - int layer_nums_ = 0; - int hidden_size_ = 1024; - std::string model_name_ = ""; - // tokenizer + int gen_seq_len_ = 0; + int all_seq_len_ = 0; + // time + int64_t prefill_us_ = 0; + int64_t decode_us_ = 0; + bool is_single_ = true; + std::shared_ptr config_; std::unique_ptr tokenizer_; -private: - virtual std::vector tokenizer(const std::string& query) = 0; - virtual VARP gen_attention_mask(int seq_len) = 0; - virtual VARP gen_position_ids(int seq_len) = 0; -private: - // MNN Modules +protected: + std::vector key_value_shape_ = {}; + std::vector past_key_values_; + VARP inputs_embeds_, attention_mask_, position_ids_; std::shared_ptr runtime_manager_; - std::shared_ptr module_; - // model dir - std::string model_dir_; + std::vector> modules_; + std::vector> decode_modules_; + std::vector> prefill_modules_; + void init_runtime(); + std::string decode(int id); + bool is_stop(int token_id); + virtual std::vector tokenizer(const std::string& query); + virtual VARP embedding(const std::vector& input_ids); + virtual VARP gen_attention_mask(int seq_len); + virtual VARP gen_position_ids(int seq_len); }; -// some embedding models -class Bge : public Embedding { +class Lvlm : public Llm { public: - Bge() { - model_name_ = "Bge"; - layer_nums_ = 24; - hidden_size_ = 1024; + Lvlm(std::shared_ptr config) : Llm(config) { + img_size_ = config->llm_config_.value("img_size", img_size_); + imgpad_len_ = config->llm_config_.value("imgpad_len", imgpad_len_); + img_start_ = config->llm_config_.value("img_start", img_start_); + img_end_ = config->llm_config_.value("img_end", img_end_); + img_pad_ = config->llm_config_.value("img_pad", img_pad_); } + ~Lvlm() { visual_module_.reset(); } + virtual void load() override; +private: + int img_size_ = 448, imgpad_len_ = 256, img_start_ = 151857, img_end_ = 151858, img_pad_ = 151859; + std::shared_ptr visual_module_; + VARP visual_embedding(const std::vector& input_ids); + std::vector url_encode(const std::string& url); + virtual std::vector tokenizer(const std::string& query) override; + virtual VARP embedding(const std::vector& input_ids) override; +}; +// Llm end + +// Embedding start +class Embedding : public Llm { +public: + Embedding(std::shared_ptr config) : Llm(config) {} + static Embedding* createEmbedding(const std::string& config_path); + static float dist(VARP var0, VARP var1); + virtual void load() override; + VARP embedding(const std::string& txt); + int dim() { return config_->hidden_size(); } private: virtual std::vector tokenizer(const std::string& query) override; virtual VARP gen_attention_mask(int seq_len) override; virtual VARP gen_position_ids(int seq_len) override; }; - // Embedding end #endif // LLM_hpp diff --git a/transformers/llm/engine/include/tokenizer.hpp b/transformers/llm/engine/include/tokenizer.hpp index 7711e0eee..16d10861d 100644 --- a/transformers/llm/engine/include/tokenizer.hpp +++ b/transformers/llm/engine/include/tokenizer.hpp @@ -17,19 +17,35 @@ class Tokenizer { public: + static 
constexpr int MAGIC_NUMBER = 430; + enum TokenizerType { + SENTENCEPIECE = 0, + TIKTOIKEN = 1, + BERT = 2, + HUGGINGFACE = 3 + }; Tokenizer() = default; virtual ~Tokenizer() = default; - virtual bool load(const std::string& filename) = 0; - virtual std::vector encode(const std::string& str) = 0; + static Tokenizer* createTokenizer(const std::string& filename); + bool is_stop(int token); + std::vector encode(const std::string& str); virtual std::string decode(int id) = 0; +protected: + virtual void load_special(std::ifstream& file); + virtual bool load_vocab(std::ifstream& file) = 0; + virtual void encode(const std::string& str, std::vector& ids) = 0; + std::vector special_tokens_; + std::vector stop_tokens_; + std::vector prefix_tokens_; }; class Sentencepiece : public Tokenizer { public: Sentencepiece() = default; - virtual bool load(const std::string& filename) override; - virtual std::vector encode(const std::string& str) override; virtual std::string decode(int id) override; +protected: + virtual bool load_vocab(std::ifstream& file) override; + virtual void encode(const std::string& str, std::vector& ids) override; private: enum ModelType { UNIGRAM = 1, @@ -76,10 +92,10 @@ class Sentencepiece : public Tokenizer { class Tiktoken : public Tokenizer { public: Tiktoken() = default; - virtual bool load(const std::string& filename) override; - virtual std::vector encode(const std::string& str) override; virtual std::string decode(int id) override; protected: + virtual bool load_vocab(std::ifstream& file) override; + virtual void encode(const std::string& str, std::vector& ids) override; std::unordered_map encoder_; std::vector decoder_; }; @@ -87,7 +103,8 @@ class Tiktoken : public Tokenizer { class BertTokenizer : public Tiktoken { public: BertTokenizer() = default; - virtual std::vector encode(const std::string& str) override; +protected: + virtual void encode(const std::string& str, std::vector& ids) override; private: std::vector word_piece(const std::string& token); }; @@ -104,9 +121,10 @@ struct hash_pair_wstring { using BPERanks = std::unordered_map, int, hash_pair_wstring>; public: HuggingfaceTokenizer() = default; - virtual bool load(const std::string& filename) override; - virtual std::vector encode(const std::string& str) override; virtual std::string decode(int id) override; +protected: + virtual bool load_vocab(std::ifstream& file) override; + virtual void encode(const std::string& str, std::vector& ids) override; private: void bpe(const std::wstring& token, const BPERanks& bpe_ranks, std::vector* result); BPERanks bpe_ranks_; diff --git a/transformers/llm/engine/llm_demo.cpp b/transformers/llm/engine/llm_demo.cpp index f8eb04593..3c6512661 100644 --- a/transformers/llm/engine/llm_demo.cpp +++ b/transformers/llm/engine/llm_demo.cpp @@ -30,7 +30,6 @@ static void trace_prepare(Llm* llm) { decode_len += llm->gen_seq_len_; prefill_time += llm->prefill_us_; decode_time += llm->decode_us_; - llm->reset(); } MNN_PRINT("Prepare for resize opt End\n"); llm->trace(false); @@ -96,7 +95,6 @@ static int benchmark(Llm* llm, const std::vector& prompts) { decode_len += llm->gen_seq_len_; prefill_time += llm->prefill_us_; decode_time += llm->decode_us_; - llm->reset(); } float prefill_s = prefill_time / 1e6; float decode_s = decode_time / 1e6; @@ -114,7 +112,6 @@ static int benchmark(Llm* llm, const std::vector& prompts) { static int ceval(Llm* llm, const std::vector& lines, std::string filename) { auto csv_data = parse_csv(lines); int right = 0, wrong = 0; - llm->max_seq_len_ = 512; std::vector 
answers; for (int i = 1; i < csv_data.size(); i++) { const auto& elements = csv_data[i]; @@ -127,7 +124,6 @@ static int ceval(Llm* llm, const std::vector& lines, std::string fi printf("%s", prompt.c_str()); printf("## 进度: %d / %lu\n", i, lines.size() - 1); auto res = llm->response(prompt.c_str()); - llm->reset(); answers.push_back(res); } { @@ -175,33 +171,23 @@ static int eval(Llm* llm, std::string prompt_file) { int main(int argc, const char* argv[]) { if (argc < 2) { - std::cout << "Usage: " << argv[0] << " model_dir " << std::endl; + std::cout << "Usage: " << argv[0] << " config.json " << std::endl; return 0; } - std::string model_dir = argv[1]; - int forwardType = 0; - if (argc >= 3) { - std::istringstream os(argv[2]); - os >> forwardType; - } - int memoryprecision = 10; - if (argc >= 4) { - std::istringstream os(argv[3]); - os >> memoryprecision; - } - std::cout << "model path is " << model_dir << std::endl; - std::unique_ptr llm(Llm::createLLM(model_dir, "auto", forwardType, memoryprecision)); + std::string config_path = argv[1]; + std::cout << "config path is " << config_path << std::endl; + std::unique_ptr llm(Llm::createLLM(config_path)); { AUTOTIME; - llm->load(model_dir); + llm->load(); } - { + if (true) { AUTOTIME; trace_prepare(llm.get()); } - if (argc < 5) { + if (argc < 3) { llm->chat(); } - std::string prompt_file = argv[4]; + std::string prompt_file = argv[2]; return eval(llm.get(), prompt_file); } diff --git a/transformers/llm/engine/src/llm.cpp b/transformers/llm/engine/src/llm.cpp index 77bf71fa2..86ff80596 100644 --- a/transformers/llm/engine/src/llm.cpp +++ b/transformers/llm/engine/src/llm.cpp @@ -16,6 +16,8 @@ #include "cpp/ExprDebug.hpp" #include "llm.hpp" #include "tokenizer.hpp" +// 0: no debug, 1: test op time, 2: print tensor info +#define DEBUG_MODE 0 #ifdef USING_VISUAL_MODEL #include "httplib.h" @@ -23,87 +25,190 @@ #endif // Llm start -Llm* Llm::createLLM(const std::string& path, std::string model_type, int forwardType, int preicsionmemory) { - auto size = path.size(); - - // end with '.mnn' is single model file, otherwise split block models - bool is_single = (size > 4 && - path[size - 4] == '.' 
&& - path[size - 3] == 'm' && - path[size - 2] == 'n' && - path[size - 1] == 'n'); +Llm* Llm::createLLM(const std::string& config_path) { + std::shared_ptr config(new LlmConfig(config_path)); Llm* llm = nullptr; - if (model_type == "auto") { - model_type = path; - } - if (model_type.find("chatglm") != std::string::npos) { - if (model_type.find("chatglm2") != std::string::npos) { - llm = new Chatglm2_6b; - } else if (model_type.find("chatglm3") != std::string::npos) { - llm = new Chatglm2_6b; - llm->model_name_ = "Chatglm3_6b"; - } else { - llm = new Chatglm_6b; + if (config->is_visual()) { + llm = new Lvlm(config); + } else { + llm = new Llm(config); + } + return llm; +} + +static MNNForwardType backend_type_convert(const std::string& type_str) { + if (type_str == "cpu") return MNN_FORWARD_CPU; + if (type_str == "metal") return MNN_FORWARD_METAL; + if (type_str == "cuda") return MNN_FORWARD_CUDA; + if (type_str == "opencl") return MNN_FORWARD_OPENCL; + if (type_str == "opengl") return MNN_FORWARD_OPENGL; + if (type_str == "vulkan") return MNN_FORWARD_VULKAN; + if (type_str == "npu") return MNN_FORWARD_NN; + return MNN_FORWARD_AUTO; +} + +void Llm::init_runtime() { + ScheduleConfig config; + BackendConfig cpuBackendConfig; + config.type = backend_type_convert(config_->backend_type()); + config.numThread = config_->thread_num(); + if (config_->memory() == "low") { + cpuBackendConfig.memory = BackendConfig::Memory_Low; + } + if (config_->precision() == "low") { + cpuBackendConfig.precision = BackendConfig::Precision_Low; + } + config.backendConfig = &cpuBackendConfig; + ExecutorScope::Current()->setGlobalExecutorConfig(config.type, cpuBackendConfig, config.numThread); + + runtime_manager_.reset(Executor::RuntimeManager::createRuntimeManager(config)); + runtime_manager_->setHint(MNN::Interpreter::MEM_ALLOCATOR_TYPE, 0); +#if DEBUG_MODE==1 + runtime_manager_->setMode(MNN::Interpreter::Session_Debug); + _initTimeTrace(); +#endif +#if DEBUG_MODE==2 + runtime_manager_->setMode(MNN::Interpreter::Session_Debug); + _initTensorStatic(); +#endif + { + runtime_manager_->setCache(".tempcache"); + } +} + +void Llm::load() { + init_runtime(); + // init module status + key_value_shape_ = config_->key_value_shape(); + is_single_ = config_->is_single(); + { + std::ifstream embedding_bin(config_->embedding_file()); + embedding_bin.close(); + } + MNN_PRINT("### is_single_ = %d\n", is_single_); + // 1. load vocab + MNN_PRINT("load tokenizer\n"); + tokenizer_.reset(Tokenizer::createTokenizer(config_->tokenizer_file())); + MNN_PRINT("load tokenizer Done\n"); + // 3. load model + Module::Config module_config; + module_config.shapeMutable = true; + module_config.rearrange = true; + int layer_nums = config_->layer_nums(); + if (is_single_) { + // load single model + key_value_shape_.insert(key_value_shape_.begin(), layer_nums); + modules_.resize(1); + std::string model_path = config_->llm_model(); + MNN_PRINT("load %s ... 
", model_path.c_str()); + runtime_manager_->setExternalFile(config_->llm_weight()); + modules_[0].reset(Module::load( + {"input_ids", "attention_mask", "position_ids", "past_key_values"}, + {"logits", "presents"}, model_path.c_str(), runtime_manager_, &module_config)); + MNN_PRINT("Done!\n"); + } else { + // load split models + modules_.resize(layer_nums + 2); + // load lm model + modules_[layer_nums].reset(Module::load({}, {}, config_->lm_model().c_str(), runtime_manager_, &module_config)); + // load block models + for (int i = 0; i < layer_nums; i++) { + std::string model_path = config_->block_model(i); + MNN_PRINT("load %s ... ", model_path.c_str()); + modules_[i].reset(Module::load( + {"inputs_embeds", "attention_mask", "position_ids", "past_key_values"}, + {"hidden_states", "presents"}, model_path.c_str(), runtime_manager_, &module_config)); + MNN_PRINT("Done!\n"); } - } else if (model_type.find("codegeex2") != std::string::npos) { - llm = new Chatglm2_6b; - llm->model_name_ = "Codegeex2_6b"; - } else if (model_type.find("qwen1.5") != std::string::npos || - model_type.find("qwen2") != std::string::npos) { - if (model_type.find("0.5b") != std::string::npos) { - llm = new Qwen2_0_5b; - } else if (model_type.find("1.8b") != std::string::npos) { - llm = new Qwen2_1_8b; - } else if (model_type.find("4b") != std::string::npos) { - llm = new Qwen2_4b; - } else if (model_type.find("7b") != std::string::npos) { - llm = new Qwen2_7b; + } + decode_modules_.resize(modules_.size()); + for (int v=0; vtraceOrOptimize(status); + } + runtime_manager_->updateCache(); +} + +VARP Llm::forward(const std::vector& input_ids) { + int seq_len = input_ids.size(); + auto attention_mask = gen_attention_mask(seq_len); + auto position_ids = gen_position_ids(seq_len); + VARP logits; + if (is_single_) { + // single model + auto hidden_states = embedding(input_ids); + auto outputs = modules_.back()->onForward({hidden_states, attention_mask, position_ids, past_key_values_[0]}); + if (outputs.empty()) { + return nullptr; } - } else if (model_type.find("qwen") != std::string::npos) { - if (model_type.find("1.8") != std::string::npos) { - llm = new Qwen_1_8b; - } else if (model_type.find("vl") != std::string::npos) { - llm = new Qwen_vl; - } else { - llm = new Qwen_7b; + ExecutorScope::Current()->gc(Executor::FULL); + logits = outputs[0]; + past_key_values_[0] = outputs[1]; + } else { + // split block models + int layer_nums = config_->layer_nums(); + auto hidden_states = embedding(input_ids); + ExecutorScope::Current()->gc(Executor::FULL); + for (int i = 0; i < layer_nums; i++) { + AUTOTIME; + auto outputs = modules_[i]->onForward({hidden_states, attention_mask, position_ids, past_key_values_[i]}); + hidden_states = outputs[0]; + past_key_values_[i] = outputs[1]; } - } else if (model_type.find("llama2") != std::string::npos) { - llm = new Llama2_7b; - } else if (model_type.find("baichuan") != std::string::npos) { - llm = new Llama2_7b; - llm->model_name_ = "Baichuan2_7b"; - } else if (model_type.find("phi2") != std::string::npos) { - llm = new Phi_2; - } else if (model_type.find("internlm") != std::string::npos) { - llm = new Llama2_7b; - llm->model_name_ = "Internlm_7b"; - } else if (model_type.find("deepseek") != std::string::npos) { - llm = new Llama2_7b; - llm->model_name_ = "deepseek_7b"; - llm->layer_nums_ = 30; - } else if (model_type.find("tinyllama") != std::string::npos) { - llm = new TinyLlama; - llm->model_name_ = "TinyLlama"; - } else if (model_type.find("yi") != std::string::npos) { - llm = new Yi_6b; - 
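[Editor's aside, not part of the patch] Llm::sample() above is greedy decoding with a repetition penalty: previously generated ids are scaled before taking the argmax. The same idea on a plain score buffer, with illustrative names:

    #include <cstddef>
    #include <vector>

    // Greedy sampling with repetition penalty, as in Llm::sample(): tokens that were
    // already generated are pushed down, then the highest remaining score wins.
    static int sampleGreedy(std::vector<float> scores, const std::vector<int>& prevIds,
                            float penalty = 1.1f) {
        for (int id : prevIds) {
            float s = scores[id];
            scores[id] = s < 0 ? s * penalty : s / penalty;
        }
        int best = 0;
        for (size_t i = 1; i < scores.size(); ++i) {
            if (scores[i] > scores[best]) best = static_cast<int>(i);
        }
        return best;
    }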
llm->model_name_ = "Yi_6b"; - } else if (model_type.find("llama3") != std::string::npos) { - llm = new Llama3_8b; - llm->model_name_ = "Llama3_8b"; - } else if (model_type.find("MiniCPM_1_2b") != std::string::npos) { - llm = new MiniCPM_1_2b; - } else if (model_type.find("MiniCPM_2_4b") != std::string::npos) { - llm = new MiniCPM_2_4b; - } - if (!llm) { - std::cerr << "model type can't judge!" << std::endl; - return llm; - } - llm->mForwardType = forwardType; - llm->is_single_ = is_single; - llm->mPrecisionMemory = preicsionmemory; - std::cout << "### model name : "<< llm->model_name_ << std::endl; - return llm; + ExecutorScope::Current()->gc(Executor::FULL); + { + AUTOTIME; + auto outputs = modules_[layer_nums]->onForward({hidden_states}); + logits = outputs[0]; + } + } + all_seq_len_ += seq_len; + gen_seq_len_++; + return logits; +} + +int Llm::sample(VARP logits, const std::vector& pre_ids) { + auto scores = (float*)(logits->readMap()); + auto size = logits->getInfo()->size; + float max_score = 0; + int token_id = 0; + // repetition penalty + const float repetition_penalty = 1.1; + for (auto id : pre_ids) { + float score = scores[id]; + scores[id] = score < 0 ? score * repetition_penalty : score / repetition_penalty; + } + // argmax + for (int i = 0; i < size; i++) { + float score = scores[i]; + if (score > max_score) { + max_score = score; + token_id = i; + } + } + return token_id; +} + +std::string Llm::apply_chat_template(const std::string& input_str) const { + auto prompt = config_->prompt_template(); + if (prompt.empty()) return input_str; + const std::string placeholder = "%s"; + size_t start_pos = prompt.find(placeholder); + if (start_pos == std::string::npos) return input_str; + prompt.replace(start_pos, placeholder.length(), input_str); + return prompt; } void Llm::chat() { @@ -115,7 +220,7 @@ void Llm::chat() { break; } if (input_str == "/reset") { - reset(); + // reset(); std::cout << "\nA: reset done." 
<< std::endl; continue; } @@ -123,10 +228,9 @@ void Llm::chat() { response(input_str); std::cout << std::endl; } - reset(); } -void Llm::response_init() { +void Llm::generate_init() { // init status gen_seq_len_ = 0; all_seq_len_ = 0; @@ -136,34 +240,72 @@ void Llm::response_init() { if (is_single_) { past_key_values_.push_back(_Input(key_value_shape_, NCHW)); } else { - for (int i = 0; i < layer_nums_; i++) { + for (int i = 0; i < config_->layer_nums(); i++) { past_key_values_.push_back(_Input(key_value_shape_, NCHW)); } } } -std::string Llm::response_impl(const std::vector& input_ids, std::ostream* os, const char* end_with) { +std::vector Llm::generate(const std::vector& input_ids, int max_new_tokens) { + generate_init(); + std::vector output_ids, all_ids = input_ids; prompt_len_ = static_cast(input_ids.size()); + if (max_new_tokens < 0) { max_new_tokens = config_->max_new_tokens(); } + // prefill + auto logits = forward(input_ids); + if (logits.get() == nullptr) { + return {}; + } + int token = sample(logits, all_ids); + output_ids.push_back(token); + all_ids.push_back(token); + // decode + while (gen_seq_len_ < max_new_tokens) { + logits = forward({token}); + if (logits.get() == nullptr) { + return {}; + } + token = sample(logits, all_ids); + if (is_stop(token)) { break; } + output_ids.push_back(token); + all_ids.push_back(token); + } + return output_ids; +} + +std::string Llm::generate(const std::vector& input_ids, std::ostream* os, const char* end_with) { + prompt_len_ = static_cast(input_ids.size()); + std::vector all_ids = input_ids; auto st = std::chrono::system_clock::now(); modules_ = prefill_modules_; - int token = forward(input_ids); + auto logits = forward(input_ids); + if (nullptr == logits.get()) { + return ""; + } + int token = sample(logits, all_ids); + all_ids.push_back(token); auto et = std::chrono::system_clock::now(); - history_.push_back(token); + modules_ = decode_modules_; std::string output_str = decode(token); prefill_us_ = std::chrono::duration_cast(et - st).count(); *os << output_str << std::flush; - modules_ = decode_modules_; - - while (gen_seq_len_ < max_seq_len_) { + while (gen_seq_len_ < config_->max_new_tokens()) { st = std::chrono::system_clock::now(); - token = forward({token}); + logits = forward({token}); + if (nullptr == logits.get()) { + return ""; + } + if (logits->getInfo()->size == 0) { + return ""; + } + token = sample(logits, all_ids); et = std::chrono::system_clock::now(); decode_us_ += std::chrono::duration_cast(et - st).count(); if (is_stop(token)) { *os << end_with << std::flush; break; } - history_.push_back(token); + all_ids.push_back(token); auto word = decode(token); *os << word << std::flush; output_str += word; @@ -174,30 +316,45 @@ std::string Llm::response_impl(const std::vector& input_ids, std::ostream* return output_str; } -std::string Llm::response(const std::string& query, std::ostream* os, const char* end_with) { - response_init(); - if (!end_with) { - end_with = "\n"; - } - // response - auto input_ids = tokenizer(query); - if (!history_.empty()) { - std::copy(input_ids.begin(), input_ids.end(), std::back_inserter(history_)); - input_ids = history_; - } else { - history_ = input_ids; - } - return response_impl(input_ids, os, end_with); +std::vector Llm::tokenizer(const std::string& query) { + auto prompt = apply_chat_template(query); + auto input_ids = tokenizer_->encode(prompt); + return input_ids; } -std::string Llm::response_nohistory(const std::string& query, std::ostream* os, const char* end_with) { - response_init(); - 
if (!end_with) { - end_with = "\n"; - } - // response +std::string Llm::response(const std::string& query, std::ostream* os, const char* end_with) { + generate_init(); + if (!end_with) { end_with = "\n"; } auto input_ids = tokenizer(query); - return response_impl(input_ids, os, end_with); + return generate(input_ids, os, end_with); +} +Llm::~Llm() { +#if DEBUG_MODE==1 + if (nullptr != gTimeTraceInfo) { + float opSummer = 0.0f; + float opFlopsSummber = 0.0f; + for (auto& iter : gTimeTraceInfo->mTypes) { + float summer = 0.0f; + float summerflops = 0.0f; + for (auto& t : iter.second) { + for (auto& t0 : t.second) { + summer += t0.first; + summerflops += t0.second; + } + } + summer = summer; + summerflops = summerflops; + MNN_PRINT("%s : %.7f, FLOP: %.7f, Speed: %.7f GFlops\n", iter.first.c_str(), summer, summerflops, summerflops / summer); + opSummer += summer; + opFlopsSummber+= summerflops; + } + MNN_PRINT("OP Summer: %.7f, Flops: %.7f, Speed: %.7f GFlops\n", opSummer, opFlopsSummber, opFlopsSummber/opSummer); + } +#endif + decode_modules_.clear(); + prefill_modules_.clear(); + modules_.clear(); + runtime_manager_.reset(); } void Llm::print_speed() { @@ -218,171 +375,6 @@ void Llm::print_speed() { printf("##################################\n"); } -void Llm::reset() { - history_.clear(); -} - -void Llm::load(const std::string& model_dir) { - model_dir_ = model_dir; - // init - ScheduleConfig config; - BackendConfig cpuBackendConfig; - config.type = (MNNForwardType)mForwardType; - if (config.type == MNN_FORWARD_OPENCL) { - config.numThread = MNN_GPU_MEMORY_BUFFER | MNN_GPU_TUNING_NORMAL; - } - ExecutorScope::Current()->setGlobalExecutorConfig(MNN_FORWARD_CPU, cpuBackendConfig, config.numThread); - - cpuBackendConfig.precision = (BackendConfig::PrecisionMode)(mPrecisionMemory % 4); - cpuBackendConfig.memory = (BackendConfig::MemoryMode)((mPrecisionMemory / 4) % 4); - printf("### precision, memory = %d, %d\n", (mPrecisionMemory % 4), ((mPrecisionMemory / 4) % 4)); - config.backendConfig = &cpuBackendConfig; - runtime_manager_.reset(Executor::RuntimeManager::createRuntimeManager(config)); - runtime_manager_->setHint(MNN::Interpreter::MEM_ALLOCATOR_TYPE, 0); -// runtime_manager_->setMode(MNN::Interpreter::Session_Debug); -// _initTensorStatic(); - { - runtime_manager_->setCache(".tempcache"); - } - load_progress_ = 0.f; - printf("load tokenizer\n"); - // 1. load vocab - std::string tokenizer_path = model_dir + "/tokenizer.txt"; - if (is_single_) { - size_t pos = model_dir.find_last_of("/\\"); - std::string dir_path = (pos != std::string::npos) ? model_dir.substr(0, pos + 1) : ""; - model_dir_ = dir_path; - tokenizer_path = dir_path + "/tokenizer.txt"; - } - load_progress_ += 5.f; - tokenizer_->load(tokenizer_path); - load_progress_ += 5.f; - printf("load tokenizer Done\n"); - { - disk_embedding_file_ = model_dir_ + "/embeddings_bf16.bin"; - std::ifstream embedding_bin(disk_embedding_file_); - is_disk_embedding_ = embedding_bin.good(); - MNN_PRINT("### disk embedding is %d\n", is_disk_embedding_); - embedding_bin.close(); - } - // 2. load model - Module::Config module_config; - module_config.shapeMutable = true; - module_config.rearrange = true; - if (is_single_) { - key_value_shape_.insert(key_value_shape_.begin(), layer_nums_); - modules_.resize(1); - std::string model_path = model_dir; - std::string external_path = model_dir + ".weight"; - MNN_PRINT("load %s ... 
", model_path.c_str()); - runtime_manager_->setExternalFile(external_path); - modules_[0].reset(Module::load( - {"input_ids", "attention_mask", "position_ids", "past_key_values"}, - {"token_id", "presents"}, model_path.c_str(), runtime_manager_, &module_config)); - MNN_PRINT("Done!\n"); - load_progress_ += 90.f; - } else { - // 2. load models - modules_.resize(layer_nums_ + 2); - float step = 90.0 / modules_.size(); - char buffer[50]; - // load lm model - std::string lm_model_path = model_dir + "/lm.mnn"; - MNN_PRINT("[%3.0f%% ] load %s model ... ", load_progress_, lm_model_path.c_str()); - modules_[layer_nums_].reset(Module::load({}, {}, lm_model_path.c_str(), runtime_manager_, &module_config)); - MNN_PRINT("Done!\n"); - load_progress_ += step; - if (!is_disk_embedding_) { - std::string embedding_model_path = model_dir + "/embedding.mnn"; - MNN_PRINT("[%3.0f%% ] load %s model ... ", load_progress_, embedding_model_path.c_str());fflush(stdout); - modules_[layer_nums_ + 1].reset(Module::load({}, {}, embedding_model_path.c_str(), runtime_manager_, &module_config)); - MNN_PRINT("Done!\n"); - load_progress_ += step; - } - if (is_visual_) { - std::string visual_model_path = model_dir + "/visual.mnn"; - MNN_PRINT("[%3.0f%% ] load %s model ... ", load_progress_, visual_model_path.c_str());fflush(stdout); - module_config.rearrange = false; - visual_module_.reset(Module::load({}, {}, visual_model_path.c_str(), runtime_manager_, &module_config)); - MNN_PRINT("Done!\n"); - module_config.rearrange = true; - } - // load glm_block models - for (int i = 0; i < layer_nums_; i++) { - load_progress_ += step; - std::string model_path = model_dir + "/block_" + std::to_string(i) + ".mnn"; - MNN_PRINT("[%3.0f%% ] load %s model ... ", load_progress_, model_path.c_str()); - modules_[i].reset(Module::load( - {"inputs_embeds", "attention_mask", "position_ids", "past_key_values"}, - {"hidden_states", "presents"}, model_path.c_str(), runtime_manager_, &module_config)); - MNN_PRINT("Done!\n"); - } - } - if (config.type == MNN_FORWARD_OPENCL) { - // warmup(); - } - decode_modules_.resize(modules_.size()); - for (int v=0; v tmp(1, 0); - forward(tmp); - all_seq_len_ = 0; - gen_seq_len_ = 0; - printf("Done\n"); -} - -int Llm::forward(const std::vector& input_ids) { - int seq_len = input_ids.size(); - auto attention_mask = gen_attention_mask(seq_len); - auto position_ids = gen_position_ids(seq_len); - int id = -1; - if (is_single_) { - // single model - auto hidden_states = _Const(input_ids.data(), {seq_len}, NCHW, halide_type_of()); - if (is_disk_embedding_) { - hidden_states = embedding(input_ids); - } - auto outputs = modules_.back()->onForward({hidden_states, attention_mask, position_ids, past_key_values_[0]}); - ExecutorScope::Current()->gc(Executor::FULL); - id = outputs[0]->readMap()[0]; - past_key_values_[0] = outputs[1]; - } else { - // split block models - auto hidden_states = embedding(input_ids); - ExecutorScope::Current()->gc(Executor::FULL); - for (int i = 0; i < layer_nums_; i++) { - AUTOTIME; - auto outputs = modules_[i]->onForward({hidden_states, attention_mask, position_ids, past_key_values_[i]}); - hidden_states = outputs[0]; - past_key_values_[i] = outputs[1]; - } - ExecutorScope::Current()->gc(Executor::FULL); - { - AUTOTIME; - auto outputs = modules_[layer_nums_]->onForward({hidden_states}); - id = outputs[0]->readMap()[0]; - } - } - all_seq_len_ += seq_len; - gen_seq_len_++; - return id; -} - static inline bool needNewVar(VARP var, int axis, int seq_len) { if (var == nullptr) { return true; @@ 
-393,27 +385,23 @@ static inline bool needNewVar(VARP var, int axis, int seq_len) { return false; } -VARP Llm::txt_embedding(const std::vector& input_ids) { - if (!is_disk_embedding_) { - // using model forward - auto inputs_ids_ = _Const(input_ids.data(), {static_cast(input_ids.size())}, NCHW, halide_type_of()); - auto hidden_states = modules_[layer_nums_ + 1]->onForward({inputs_ids_})[0]; - return hidden_states; - } +VARP Llm::embedding(const std::vector& input_ids) { AUTOTIME; // disk embedding to save memory + int hidden_size = config_->hidden_size(); int seq_len = static_cast(input_ids.size()); if (needNewVar(inputs_embeds_, 0, seq_len)) { - inputs_embeds_ = _Input({seq_len, 1, hidden_size_}, NCHW); + inputs_embeds_ = _Input({seq_len, 1, hidden_size}, NCHW); } - size_t size = hidden_size_ * sizeof(int16_t); - FILE* file = fopen(disk_embedding_file_.c_str(), "rb"); - std::unique_ptr buffer(new int16_t[hidden_size_]); + + size_t size = hidden_size * sizeof(int16_t); + FILE* file = fopen(config_->embedding_file().c_str(), "rb"); + std::unique_ptr buffer(new int16_t[hidden_size]); for (size_t i = 0; i < seq_len; i++) { fseek(file, input_ids[i] * size, SEEK_SET); fread(buffer.get(), 1, size, file); - auto ptr = inputs_embeds_->writeMap() + i * hidden_size_ * 2; - for (int j = 0; j < hidden_size_; j++) { + auto ptr = inputs_embeds_->writeMap() + i * hidden_size * 2; + for (int j = 0; j < hidden_size; j++) { ptr[j * 2] = 0; ptr[j * 2 + 1] = buffer[j]; } @@ -422,31 +410,6 @@ VARP Llm::txt_embedding(const std::vector& input_ids) { return inputs_embeds_; } -void Llm::trace(bool start) { - auto status = MNN::Interpreter::Session_Resize_Check; - if (start) { - status = MNN::Interpreter::Session_Resize_Check; - } else { - status = MNN::Interpreter::Session_Resize_Fix; - } - for (auto& m : decode_modules_) { - m->traceOrOptimize(status); - } - runtime_manager_->updateCache(); -} - -VARP Llm::embedding(const std::vector& input_ids) { - if (is_visual_ && !gen_seq_len_) { - return visual_embedding(input_ids); - } - return txt_embedding(input_ids); -} - -std::vector Llm::tokenizer_encode(const std::string& input_str) { - auto ids = tokenizer_->encode(input_str); - return ids; -} - std::string Llm::decode(int id) { std::string word = tokenizer_->decode(id); // Fix utf-8 garbled characters @@ -457,159 +420,131 @@ std::string Llm::decode(int id) { return word; } -// Chatglm_6b -std::vector Chatglm_6b::tokenizer(const std::string& query) { - auto ids = tokenizer_encode(query); - context_len_ = ids.size(); - ids.push_back(130001); - ids.push_back(130004); - return ids; -} - -VARP Chatglm_6b::gen_attention_mask(int seq_len) { - auto attention_mask = _Input({1, 1, seq_len, seq_len}, NCHW, halide_type_of()); - auto ptr = attention_mask->writeMap(); - for (int i = 0; i < seq_len * seq_len; i++) { - ptr[i] = 0; - } - if (seq_len > 1) { - for (int i = 1; i < seq_len; i++) { - ptr[seq_len * i - 1] = 1; +VARP Llm::gen_attention_mask(int seq_len) { + if (config_->attention_mask() == "float") { + if (needNewVar(attention_mask_, 2, seq_len)) { + attention_mask_ = _Input({1, 1, seq_len, seq_len}, NCHW, halide_type_of()); + } else { + return attention_mask_; } - } - return attention_mask; -} - -VARP Chatglm_6b::gen_position_ids(int seq_len) { - auto position_ids = _Input({1, 2, seq_len}, NCHW, halide_type_of()); - auto ptr = position_ids->writeMap(); - if (seq_len == 1) { - ptr[0] = 1; - ptr[1] = all_seq_len_ - context_len_; - } else { - for (int i = 0; i < seq_len; i++) { - ptr[i] = i; - ptr[seq_len + i] = 0; - } - 
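[Editor's aside, not part of the patch] Llm::embedding() above streams one bf16 row per token id from embeddings_bf16.bin and widens it to fp32 in place by writing the stored 16 bits into the high half of each float (the `ptr[j * 2] = 0; ptr[j * 2 + 1] = buffer[j];` pair, on a little-endian host). A standalone sketch of that widening step:

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Widen a bf16 row (read as raw uint16 values) to fp32: the bf16 bits become the
    // high 16 bits of the float, the low 16 bits are zero. Same trick embedding() uses
    // when filling inputs_embeds_.
    std::vector<float> bf16RowToFloat(const std::vector<uint16_t>& row) {
        std::vector<float> out(row.size());
        for (size_t j = 0; j < row.size(); ++j) {
            uint32_t bits = static_cast<uint32_t>(row[j]) << 16;
            std::memcpy(&out[j], &bits, sizeof(float));
        }
        return out;
    }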
ptr[2 * seq_len - 1] = 1; - } - return position_ids; -} - -bool Chatglm_6b::is_stop(int token_id) { - return token_id == 130005; -} - -// Chatglm2_6b -std::vector Chatglm2_6b::tokenizer(const std::string& query) { - auto prompt = "问:" + query + "\n答:"; - auto ids = tokenizer_encode(prompt); - if (history_.empty()) { - ids.insert(ids.begin(), 64792); - ids.insert(ids.begin(), 64790); - } - return ids; -} - -VARP Chatglm2_6b::gen_attention_mask(int seq_len) { - auto attention_mask = _Input({1, 1, seq_len, seq_len}, NCHW, halide_type_of()); - auto ptr = attention_mask->writeMap(); - if (seq_len > 1) { + auto ptr = attention_mask_->writeMap(); for (int i = 0; i < seq_len; i++) { for (int j = 0; j < seq_len; j++) { - ptr[seq_len * i + j] = j > i; + ptr[seq_len * i + j] = (j > i) * std::numeric_limits::lowest(); } } + return attention_mask_; } else { - ptr[0] = 0; - } - return attention_mask; -} - -VARP Chatglm2_6b::gen_position_ids(int seq_len) { - auto position_ids = _Input({seq_len}, NCHW, halide_type_of()); - auto ptr = position_ids->writeMap(); - if (seq_len == 1) { - ptr[0] = gen_seq_len_; - } else { - for (int i = 0; i < seq_len; i++) { - ptr[i] = i; + if (needNewVar(attention_mask_, 2, seq_len)) { + attention_mask_ = _Input({1, 1, seq_len, seq_len}, NCHW, halide_type_of()); + } else { + return attention_mask_; } + auto ptr = attention_mask_->writeMap(); + if (config_->attention_mask() == "glm") { + // chatglm + for (int i = 0; i < seq_len * seq_len; i++) { + ptr[i] = 0; + } + if (seq_len > 1) { + for (int i = 1; i < seq_len; i++) { + ptr[seq_len * i - 1] = 1; + } + } + } else { + bool is_glm2 = config_->attention_mask() == "glm2"; + for (int i = 0; i < seq_len; i++) { + for (int j = 0; j < seq_len; j++) { + ptr[seq_len * i + j] = is_glm2 ? j > i : j <= i; + } + } + } + return attention_mask_; } - return position_ids; -} - -bool Chatglm2_6b::is_stop(int token_id) { - return token_id <= 2; -} - -// Phi_2 -std::vector Phi_2::tokenizer(const std::string& query) { - auto prompt = query; - auto ids = tokenizer_encode(prompt); - return ids; } -bool Phi_2::is_stop(int token_id) { - return token_id == 50256; -} - -// Qwen_7b -std::vector Qwen_7b::tokenizer(const std::string& query) { - auto ids = tokenizer_encode(query); - // auto prompt = "\n<|im_start|>user\n" + query + "<|im_end|>\n<|im_start|>assistant\n"; - ids.insert(ids.begin(), {198, 151644, 872, 198}); - ids.insert(ids.end(), {151645, 198, 151644, 77091, 198}); - return ids; -} - -VARP Qwen_7b::gen_attention_mask(int seq_len) { - if (needNewVar(attention_mask_, 2, seq_len)) { - attention_mask_ = _Input({1, 1, seq_len, seq_len}, NCHW, halide_type_of()); +VARP Llm::gen_position_ids(int seq_len) { + if (config_->attention_mask() == "glm") { + // chatglm + if (needNewVar(position_ids_, 2, seq_len)) { + position_ids_ = _Input({1, 2, seq_len}, NCHW, halide_type_of()); + } + auto ptr = position_ids_->writeMap(); + if (seq_len == 1) { + ptr[0] = all_seq_len_ - gen_seq_len_ - 2; + ptr[1] = gen_seq_len_ + 1; + } else { + for (int i = 0; i < seq_len - 1; i++) { + ptr[i] = i; + ptr[seq_len + i] = 0; + } + ptr[seq_len - 1] = seq_len - 2; + ptr[2 * seq_len - 1] = 1; + } + return position_ids_; } else { - return attention_mask_; - } - auto ptr = attention_mask_->writeMap(); - for (int i = 0; i < seq_len; i++) { - for (int j = 0; j < seq_len; j++) { - ptr[seq_len * i + j] = j <= i; + bool is_glm2 = config_->attention_mask() == "glm2"; + if (needNewVar(position_ids_, 0, seq_len)) { + position_ids_ = _Input({seq_len}, NCHW, halide_type_of()); } + auto ptr 
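[Editor's aside, not part of the patch] For the "float" attention_mask type, gen_attention_mask() builds the standard additive causal mask: zero on and below the diagonal, the lowest float value above it. The same [seq_len, seq_len] mask as a flat buffer:

    #include <limits>
    #include <vector>

    // Additive causal mask as produced for the "float" attention_mask type:
    // entry (i, j) is 0 when j <= i and lowest-float (effectively -inf) when j > i.
    std::vector<float> causalMask(int seqLen) {
        std::vector<float> mask(static_cast<size_t>(seqLen) * seqLen, 0.0f);
        for (int i = 0; i < seqLen; ++i) {
            for (int j = i + 1; j < seqLen; ++j) {
                mask[static_cast<size_t>(i) * seqLen + j] = std::numeric_limits<float>::lowest();
            }
        }
        return mask;
    }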
= position_ids_->writeMap(); + if (seq_len == 1) { + ptr[0] = is_glm2 ? gen_seq_len_ : all_seq_len_; + } else { + for (int i = 0; i < seq_len; i++) { + ptr[i] = i; + } + } + return position_ids_; } - return attention_mask_; } -VARP Qwen_7b::gen_position_ids(int seq_len) { - if (needNewVar(position_ids_, 0, seq_len)) { - position_ids_ = _Input({seq_len}, NCHW, halide_type_of()); - } - auto ptr = position_ids_->writeMap(); - if (seq_len == 1) { - ptr[0] = all_seq_len_; - } else { - for (int i = 0; i < seq_len; i++) { - ptr[i] = i; - } - } - return position_ids_; +bool Llm::is_stop(int token_id) { + return tokenizer_->is_stop(token_id); } -bool Qwen_7b::is_stop(int token_id) { - // <|endoftext|> <|im_end|> - return token_id == 151643 || token_id == 151645; +void Lvlm::load() { + Llm::load(); + Module::Config module_config; + module_config.shapeMutable = true; + module_config.rearrange = false; + visual_module_.reset(Module::load({}, {}, config_->visual_model().c_str(), runtime_manager_, &module_config)); } -// Qwen_vl -std::vector Qwen_vl::url_encode(const std::string& url) { - std::vector ascii_values(imgpad_len_, img_pad_); +std::vector Lvlm::url_encode(const std::string& url) { + std::vector ascii_values(imgpad_len_ + 2, img_pad_); ascii_values[0] = img_start_; - ascii_values[imgpad_len_ - 1] = img_end_; + ascii_values[imgpad_len_ + 1] = img_end_; for (int i = 0; i < url.size(); i++) { ascii_values[i + 1] = static_cast(url[i]); } return ascii_values; } -VARP Qwen_vl::visual_embedding(const std::vector& input_ids) { +std::vector Lvlm::tokenizer(const std::string& query) { + auto prompt = apply_chat_template(query); + // split query + std::regex img_regex("(.*?)"); + std::string::const_iterator searchStart(prompt.cbegin()); + std::smatch match; + std::vector img_info, txt_info; + std::vector ids {}; + while (std::regex_search(searchStart, prompt.cend(), match, img_regex)) { + std::cout << match[1].str() << std::endl; + auto txt_ids = tokenizer_->encode(match.prefix().str()); + ids.insert(ids.end(), txt_ids.begin(), txt_ids.end()); + auto img_ids = url_encode(match[1].str()); + ids.insert(ids.end(), img_ids.begin(), img_ids.end()); + searchStart = match.suffix().first; + } + if (searchStart != prompt.cend()) { + auto txt_ids = tokenizer_->encode(std::string(searchStart, prompt.cend())); + ids.insert(ids.end(), txt_ids.begin(), txt_ids.end()); + } + return ids; +} + +VARP Lvlm::embedding(const std::vector& input_ids) { #ifdef USING_VISUAL_MODEL int start_pos = 0, pad_pos = 0, end_pos = 0; for (int i = 0; i < input_ids.size(); i++) { @@ -625,11 +560,11 @@ VARP Qwen_vl::visual_embedding(const std::vector& input_ids) { } } if (!start_pos) { - return txt_embedding(input_ids); + return Llm::embedding(input_ids); } - std::vector prefix(input_ids.begin(), input_ids.begin() + start_pos); + std::vector prefix(input_ids.begin(), input_ids.begin() + start_pos + 1); std::vector img_ascii(input_ids.begin() + start_pos + 1, input_ids.begin() + pad_pos); - std::vector suffix(input_ids.begin() + end_pos + 1, input_ids.end()); + std::vector suffix(input_ids.begin() + end_pos, input_ids.end()); std::string img_path; for (auto ascii_val : img_ascii) { img_path += static_cast(ascii_val); @@ -671,192 +606,14 @@ VARP Qwen_vl::visual_embedding(const std::vector& input_ids) { image = MNN::Express::_Convert(image, NC4HW4); auto image_embedding = visual_module_->forward(image); image_embedding = MNN::Express::_Permute(image_embedding, {1, 0, 2}); - auto prefix_embedding = txt_embedding(prefix); - auto suffix_embedding 
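[Editor's aside, not part of the patch] Lvlm::tokenizer() above splits the prompt on image tags with std::regex before interleaving text ids with the url-encoded image placeholder block. The angle-bracketed tag literal was lost in this listing, so the "<img>...</img>" pattern below is an assumption about the original source; the splitting structure itself follows the patch:

    #include <regex>
    #include <string>
    #include <vector>

    // Split "text <img>url</img> more text" into alternating text and url pieces,
    // the same walk Lvlm::tokenizer() performs before encoding each part.
    void splitImageTags(const std::string& prompt,
                        std::vector<std::string>& texts, std::vector<std::string>& urls) {
        std::regex imgRegex("<img>(.*?)</img>");   // tag literal assumed
        auto searchStart = prompt.cbegin();
        std::smatch match;
        while (std::regex_search(searchStart, prompt.cend(), match, imgRegex)) {
            texts.push_back(match.prefix().str());
            urls.push_back(match[1].str());
            searchStart = match.suffix().first;
        }
        texts.push_back(std::string(searchStart, prompt.cend()));
    }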
= txt_embedding(suffix); + auto prefix_embedding = Llm::embedding(prefix); + auto suffix_embedding = Llm::embedding(suffix); auto embeddings = MNN::Express::_Concat({prefix_embedding, image_embedding, suffix_embedding}, 0); #else - auto embeddings = txt_embedding(input_ids); + auto embeddings = Llm::embedding(input_ids); #endif return embeddings; } - -std::vector Qwen_vl::tokenizer(const std::string& query) { - // split query - std::regex img_regex("(.*?)"); - std::string::const_iterator searchStart(query.cbegin()); - std::smatch match; - std::vector img_info, txt_info; - std::vector ids {}; - while (std::regex_search(searchStart, query.cend(), match, img_regex)) { - auto txt_ids = tokenizer_encode(match.prefix().str()); - ids.insert(ids.end(), txt_ids.begin(), txt_ids.end()); - auto img_ids = url_encode(match[1].str()); - ids.insert(ids.end(), img_ids.begin(), img_ids.end()); - searchStart = match.suffix().first; - } - if (searchStart != query.cend()) { - auto txt_ids = tokenizer_encode(std::string(searchStart, query.cend())); - ids.insert(ids.end(), txt_ids.begin(), txt_ids.end()); - } - // auto prompt = "\n<|im_start|>user\n" + query + "<|im_end|>\n<|im_start|>assistant\n"; - ids.insert(ids.begin(), {198, 151644, 872, 198}); - ids.insert(ids.end(), {151645, 198, 151644, 77091, 198}); - return ids; -} - -VARP Qwen_vl::gen_attention_mask(int seq_len) { - if (seq_len == 1) { - auto attention_mask = _Input({1, 1, 1, all_seq_len_ + 1}, NCHW, halide_type_of()); - auto ptr = attention_mask->writeMap(); - for (int i = 0; i < all_seq_len_ + 1; i++) { - ptr[i] = 0; - } - return attention_mask; - } else { - auto attention_mask = _Input({1, 1, seq_len, seq_len}, NCHW, halide_type_of()); - auto ptr = attention_mask->writeMap(); - for (int i = 0; i < seq_len; i++) { - for (int j = 0; j < seq_len; j++) { - ptr[seq_len * i + j] = (j > i) * std::numeric_limits::lowest(); - } - } - return attention_mask; - } -} - -// Llama2_7b -std::vector Llama2_7b::tokenizer(const std::string& query) { - auto ids = tokenizer_encode(query); - if (model_name_ == "Baichuan2_7b") { - // baichuan2: {query}: 195, query, 196 - ids.insert(ids.begin(), 195); - ids.push_back(196); - return ids; - } - if (model_name_ == "Internlm_7b") { - // internlm: "<|User|>:" + query + "\n<|Bot|>:"; - // 1, 333, 352, 1621, 352, 27232, query, 103027, 364, 333, 352, 23845, 352, 27232 - ids.insert(ids.begin(), {1, 333, 352, 1621, 352, 27232}); - ids.insert(ids.end(), {103027, 364, 333, 352, 23845, 352, 27232}); - return ids; - } - if (model_name_ == "deepseek_7b") { - // "<|begin▁of▁sentence|>User:" + query + "\n\nAssistant:" - ids.insert(ids.begin(), {100000, 5726, 25, 207}); - ids.insert(ids.end(), {185, 185, 77398, 25}); - return ids; - } - // llama2: [INST]{query}[/INST]: 1, 5539, 25580, 29962, query, 12452, 25580, 29962 - ids.insert(ids.begin(), {1, 5539, 25580, 29962}); - ids.insert(ids.end(), {12452, 25580, 29962}); - return ids; -} - -VARP Llama2_7b::gen_attention_mask(int seq_len) { - if (needNewVar(attention_mask_, 2, seq_len)) { - attention_mask_ = _Input({1, 1, seq_len, seq_len}, NCHW, halide_type_of()); - } else { - return attention_mask_; - } - auto ptr = attention_mask_->writeMap(); - for (int i = 0; i < seq_len; i++) { - for (int j = 0; j < seq_len; j++) { - ptr[seq_len * i + j] = (j > i) * std::numeric_limits::lowest(); - } - } - return attention_mask_; -} - -VARP Llama2_7b::gen_position_ids(int seq_len) { - if (needNewVar(position_ids_, 1, seq_len)) { - position_ids_ = _Input({1, seq_len}, NCHW, halide_type_of()); - } - auto 
ptr = position_ids_->writeMap(); - if (seq_len == 1) { - ptr[0] = all_seq_len_; - } else { - for (int i = 0; i < seq_len; i++) { - ptr[i] = i; - } - } - return position_ids_; -} - -bool Llama2_7b::is_stop(int token_id) { - if (model_name_ == "Internlm_7b") { - // 103028: - return token_id == 2 || token_id == 103028; - } - if (model_name_ == "deepseek_7b") { - return token_id == 100001; - } - return token_id == 2; -} - -std::vector MiniCPM_1_2b::tokenizer(const std::string& query) { - auto ids = tokenizer_encode(query); - // auto prompt = "<用户>" + query + ""; - ids.insert(ids.begin(), {59396, 4194, 59388}); - ids.insert(ids.end(), {59396, 10850, 59388}); - return ids; -} - -std::vector MiniCPM_2_4b::tokenizer(const std::string& query) { - auto ids = tokenizer_encode(query); - // auto prompt = "<用户>" + query + ""; - ids.insert(ids.begin(), {95396, 4194, 95388}); - ids.insert(ids.end(), {95396, 10850, 95388}); - return ids; -} - -std::vector Qwen2::tokenizer(const std::string& query) { - auto ids = tokenizer_encode(query); - // auto prompt = "<|im_start|>user\n" + query + "<|im_end|>\n<|im_start|>assistant\n"; - ids.insert(ids.begin(), {151644, 872, 198}); - ids.insert(ids.end(), {151645, 198, 151644, 77091, 198}); - return ids; -} - -bool Qwen2::is_stop(int token_id) { - return token_id == 151645 || token_id == 151643; -} - -std::vector TinyLlama::tokenizer(const std::string& query) { - auto ids = tokenizer_encode(query); - /* - <|system|> - You are a friendly chatbot who always responds in the style of a pirate - <|user|> - {query} - <|assistant|> - */ - ids.insert(ids.begin(), {1, 529, 29989, 5205, 29989, 29958, 13, 3492, 526, 263, 19780, 13563, - 7451, 1058, 2337, 10049, 29879, 297, 278, 3114, 310, 263, 21625, - 403, 2, 29871, 13, 29966, 29989, 1792, 29989, 29958, 13}); - ids.insert(ids.end(), {2, 29871, 13, 29966, 29989, 465, 22137, 29989, 29958, 13}); - return ids; -} - -std::vector Yi_6b::tokenizer(const std::string& query) { - auto prompt = "<|im_start|> user\n" + query + "<|im_end|>\n<|im_start|> assistant\n"; - auto ids = tokenizer_encode(prompt); - return ids; -} - -bool Yi_6b::is_stop(int token_id) { - return token_id == 7 || token_id == 64001; -} -std::vector Llama3_8b::tokenizer(const std::string& query) { - // <|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n+query+<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n - auto ids = tokenizer_encode(query); - ids.insert(ids.begin(), {128000, 128006, 882, 128007, 271}); - ids.insert(ids.end(), {128009, 128006, 78191, 128007, 271}); - return ids; -} - -bool Llama3_8b::is_stop(int token_id) { - return token_id == 128001 || token_id == 128009; -} // Llm end // Embedding start @@ -866,54 +623,28 @@ float Embedding::dist(VARP var0, VARP var1) { return dist; } -Embedding* Embedding::createEmbedding(const std::string& path, std::string model_type) { - auto size = path.size(); - - Embedding* embedding = nullptr; - if (model_type == "auto") { - model_type = path; - } - if (model_type.find("bge") != std::string::npos) { - embedding = new Bge; - } - if (!embedding) { - std::cerr << "model type can't judge!" 
<< std::endl; - return embedding; - } - std::cout << "### model name : "<< embedding->model_name_ << std::endl; - embedding->load(path); +Embedding* Embedding::createEmbedding(const std::string& config_path) { + std::shared_ptr config(new LlmConfig(config_path)); + Embedding* embedding = new Embedding(config); + embedding->load(); return embedding; } -void Embedding::load(const std::string& model_dir) { - if (model_dir_ == model_dir) { - return; - } - model_dir_ = model_dir; - // init - ScheduleConfig config; - BackendConfig cpuBackendConfig; - config.type = MNN_FORWARD_CPU; - // config.type = MNN_FORWARD_OPENCL; - config.numThread = 4; - cpuBackendConfig.precision = BackendConfig::Precision_Low; - cpuBackendConfig.memory = BackendConfig::Memory_Low; - config.backendConfig = &cpuBackendConfig; - runtime_manager_.reset(Executor::RuntimeManager::createRuntimeManager(config)); +void Embedding::load() { + init_runtime(); printf("load tokenizer\n"); + std::cout << config_->tokenizer_file() << std::endl; // 1. load vocab - size_t pos = model_dir.find_last_of("/\\"); - std::string dir_path = (pos != std::string::npos) ? model_dir.substr(0, pos + 1) : ""; - std::string tokenizer_path = dir_path + "/tokenizer.txt"; - tokenizer_->load(tokenizer_path); + tokenizer_.reset(Tokenizer::createTokenizer(config_->tokenizer_file())); printf("load tokenizer Done\n"); // 2. load model Module::Config module_config; module_config.shapeMutable = true; module_config.rearrange = true; - std::string model_path = model_dir; + auto model_path = config_->llm_model(); MNN_PRINT("load %s ... ", model_path.c_str()); - module_.reset(Module::load( + modules_.resize(1); + modules_[0].reset(Module::load( {"input_ids", "attention_mask", "position_ids"}, {"sentence_embeddings"}, model_path.c_str(), runtime_manager_, &module_config)); MNN_PRINT("Done!\n"); @@ -921,45 +652,26 @@ void Embedding::load(const std::string& model_dir) { VARP Embedding::embedding(const std::string& txt) { auto ids = tokenizer(txt); - prompt_len_ = ids.size(); - auto inputs_ids = _Const(ids.data(), {prompt_len_}, NCHW, halide_type_of()); - auto attention_mask = gen_attention_mask(prompt_len_); - auto position_ids = gen_position_ids(prompt_len_); - auto st = std::chrono::system_clock::now(); - auto outputs = module_->onForward({inputs_ids, attention_mask, position_ids}); - auto et = std::chrono::system_clock::now(); - embedding_us_ = std::chrono::duration_cast(et - st).count(); + int prompt_len = ids.size(); + auto inputs_ids = _Const(ids.data(), {prompt_len}, NCHW, halide_type_of()); + auto attention_mask = gen_attention_mask(prompt_len); + auto position_ids = gen_position_ids(prompt_len); + auto outputs = modules_[0]->onForward({inputs_ids, attention_mask, position_ids}); auto sentence_embeddings = outputs[0]; - // print_speed(); return sentence_embeddings; } -void Embedding::print_speed() { - auto total_s = embedding_us_ * 1e-6; - printf("\n#################################\n"); - printf(" total token = %d\n", prompt_len_); - printf(" total time = %.2f s\n", total_s); - printf(" total speed = %.2f tok/s\n", prompt_len_ / total_s); - printf("##################################\n"); -} - -std::vector Embedding::tokenizer_encode(const std::string& input_str) { - auto ids = tokenizer_->encode(input_str); - return ids; -} - -std::vector Bge::tokenizer(const std::string& query) { +std::vector Embedding::tokenizer(const std::string& query) { auto prompt = query; if (query.size() <= 256) { prompt = "为这个句子生成表示以用于检索相关文章:" + query; } - auto ids = 
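[Editor's aside, not part of the patch] The Embedding path now mirrors Llm: createEmbedding(config_path) builds the module from the same LlmConfig and loads it before returning, and dist() compares two sentence embeddings. A minimal usage sketch; the config file name is hypothetical, Embedding is assumed to be declared in llm.hpp, and dist() is assumed to be a static helper:

    #include <iostream>
    #include <memory>
    #include "llm.hpp"   // assumed to also declare Embedding

    int main() {
        std::unique_ptr<Embedding> bge(Embedding::createEmbedding("bge_config.json"));
        auto v0 = bge->embedding("What is deep learning?");
        auto v1 = bge->embedding("Deep learning is a branch of machine learning.");
        std::cout << "distance: " << Embedding::dist(v0, v1) << std::endl;
        return 0;
    }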
tokenizer_encode(prompt); - ids.insert(ids.begin(), 101); - ids.push_back(102); + prompt = apply_chat_template(prompt); + auto ids = tokenizer_->encode(prompt); return ids; } -VARP Bge::gen_attention_mask(int seq_len) { +VARP Embedding::gen_attention_mask(int seq_len) { auto attention_mask = _Input({1, 1, 1, seq_len}, NCHW, halide_type_of()); auto ptr = attention_mask->writeMap(); for (int i = 0; i < seq_len; i++) { @@ -968,7 +680,7 @@ VARP Bge::gen_attention_mask(int seq_len) { return attention_mask; } -VARP Bge::gen_position_ids(int seq_len) { +VARP Embedding::gen_position_ids(int seq_len) { auto position_ids = _Input({1, seq_len}, NCHW, halide_type_of()); auto ptr = position_ids->writeMap(); for (int i = 0; i < seq_len; i++) { diff --git a/transformers/llm/engine/src/tokenizer.cpp b/transformers/llm/engine/src/tokenizer.cpp index fb71443f0..d6f6490c8 100644 --- a/transformers/llm/engine/src/tokenizer.cpp +++ b/transformers/llm/engine/src/tokenizer.cpp @@ -78,18 +78,131 @@ static inline void to_lower_case(std::string& str) { } } -bool Sentencepiece::load(const std::string& filename) { +Tokenizer* Tokenizer::createTokenizer(const std::string& filename) { + Tokenizer* tokenizer = nullptr; + // check file std::ifstream tok_file(filename); + if (!tok_file.good()) { + printf("Failed: can't load tokenzier from: %s.\n", filename.c_str()); + return tokenizer; + } + // check tokenizer info + std::string line; + std::getline(tok_file, line); + std::istringstream line_str(line); + int magic_number, tokenizer_type; + line_str >> magic_number; + if (magic_number != MAGIC_NUMBER) { + printf("Failed: magic number is wrong from: %s.\n", filename.c_str()); + return tokenizer; + } + line_str >> tokenizer_type; + printf("tokenizer_type = %d\n", tokenizer_type); + // create tokenizer + switch (tokenizer_type) + { + case SENTENCEPIECE: + tokenizer = new Sentencepiece(); + break; + case TIKTOIKEN: + tokenizer = new Tiktoken(); + break; + case BERT: + tokenizer = new BertTokenizer(); + break; + case HUGGINGFACE: + tokenizer = new HuggingfaceTokenizer(); + break; + default: + return tokenizer; + } + // load special tokens + tokenizer->load_special(tok_file); + // load vocabs + tokenizer->load_vocab(tok_file); + tok_file.close(); + return tokenizer; +} + +bool Tokenizer::is_stop(int token) { + return std::find(stop_tokens_.begin(), stop_tokens_.end(), token) != stop_tokens_.end(); +} + +void Tokenizer::load_special(std::ifstream& tok_file) { + std::string line; + std::getline(tok_file, line); + std::istringstream line_str(line); + int special_num, stop_num, prefix_num; + line_str >> special_num >> stop_num >> prefix_num; + std::getline(tok_file, line); + std::istringstream specail_line(line); + if (special_num) { + // load special tokens + special_tokens_.resize(special_num); + for (int i = 0; i < special_num; i++) { + specail_line >> special_tokens_[i]; + } + } + if (stop_num) { + // load stop tokens + stop_tokens_.resize(stop_num); + for (int i = 0; i < stop_num; i++) { + specail_line >> stop_tokens_[i]; + } + } + if (prefix_num) { + // load prefix tokens + prefix_tokens_.resize(prefix_num); + for (int i = 0; i < prefix_num; i++) { + specail_line >> prefix_tokens_[i]; + } + } +} + +std::vector Tokenizer::encode(const std::string& str) { + std::vector ids = prefix_tokens_; + if (!special_tokens_.empty()) { + std::string text = str; + size_t start = 0; + for (size_t i = 0; i < text.length(); ++i) { + for (auto special_id : special_tokens_) { + const auto& token = decode(special_id); + if (token.empty()) 
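[Editor's aside, not part of the patch] createTokenizer() and load_special() above fix the new tokenizer.txt layout: a first line with the magic number (430, per the export script) and the tokenizer type, a second line with the special/stop/prefix counts, a third line listing those ids in that order, and then the type-specific vocab. A sketch of a reader for just that header; the struct and field names are mine:

    #include <fstream>
    #include <sstream>
    #include <string>
    #include <vector>

    // Header layout used by Tokenizer::createTokenizer()/load_special():
    //   line 1: <magic=430> <type>   (0 sentencepiece, 1 tiktoken, 2 bert, 3 huggingface)
    //   line 2: <special_num> <stop_num> <prefix_num>
    //   line 3: special ids, stop ids, prefix ids, space separated, in that order
    struct TokenizerHeader {
        int type = -1;
        std::vector<int> special, stop, prefix;
    };

    bool readTokenizerHeader(std::ifstream& file, TokenizerHeader& out) {
        std::string line;
        std::getline(file, line);
        std::istringstream first(line);
        int magic = 0;
        first >> magic >> out.type;
        if (magic != 430) return false;
        std::getline(file, line);
        std::istringstream counts(line);
        int specialNum = 0, stopNum = 0, prefixNum = 0;
        counts >> specialNum >> stopNum >> prefixNum;
        std::getline(file, line);
        std::istringstream ids(line);
        out.special.resize(specialNum); out.stop.resize(stopNum); out.prefix.resize(prefixNum);
        for (int& v : out.special) ids >> v;
        for (int& v : out.stop)    ids >> v;
        for (int& v : out.prefix)  ids >> v;
        return true;
    }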
continue; + if (i + token.length() <= text.length() && text.substr(i, token.length()) == token) { + if (i > start) { + encode(text.substr(start, i - start), ids); + } + ids.push_back(special_id); + start = i + token.length(); + i = start - 1; + break; + } + } + } + if (start < text.length()) { + encode(text.substr(start), ids); + } + } else { + encode(str, ids); + } + return ids; +} + +bool Sentencepiece::load_vocab(std::ifstream& tok_file) { std::string line, token; + std::getline(tok_file, line); + int vocab_len = std::stoi(line); float score; - int index = 0, type; - while (std::getline(tok_file, line)) { + int type; + sentence_pieces_.resize(vocab_len); + for (int index = 0; index < vocab_len; index++) { + std::getline(tok_file, line); std::istringstream line_str(line); line_str >> token >> score >> type; token = base64_decode(token); auto piece_type = static_cast(type); SentencePiece piece {token, score, piece_type}; - sentence_pieces_.emplace_back(std::move(piece)); + sentence_pieces_[index] = std::move(piece); if (piece_type == PieceType::NORMAL) { pieces_.insert({token, index}); } else { @@ -98,9 +211,7 @@ bool Sentencepiece::load(const std::string& filename) { unk_id_ = index; } } - index++; } - tok_file.close(); return true; } @@ -270,8 +381,7 @@ Sentencepiece::EncodeResult Sentencepiece::bpe_encode(std::string_view normalize return output; } -std::vector Sentencepiece::encode(const std::string& str) { - std::vector ids; +void Sentencepiece::encode(const std::string& str, std::vector& ids) { auto result = bpe_encode(str); size_t consumed = 0; for (const auto &p : result) { @@ -291,7 +401,6 @@ std::vector Sentencepiece::encode(const std::string& str) { ids.push_back(id); } } - return ids; } std::string Sentencepiece::decode(int id) { @@ -315,26 +424,24 @@ bool Sentencepiece::is_control(int id) const { return sentence_pieces_[id].type == PieceType::CONTROL; } -bool Tiktoken::load(const std::string& filename) { - std::ifstream tok_file(filename); - if (!tok_file.good()) { - printf("Failed: can't load tokenzier from: %s.\n", filename.c_str()); - return false; - } - std::string token; - while (tok_file >> token) { - token = base64_decode(token); - encoder_[token] = static_cast(decoder_.size()); - decoder_.push_back(token); +bool Tiktoken::load_vocab(std::ifstream& tok_file) { + std::string line; + std::getline(tok_file, line); + int vocab_len = std::stoi(line); + // load vocab + decoder_.resize(vocab_len); + for (int i = 0; i < vocab_len; i++) { + std::getline(tok_file, line); + auto token = base64_decode(line); + encoder_.insert({token, i}); + decoder_[i] = token; } - tok_file.close(); return true; } -std::vector Tiktoken::encode(const std::string& str) { - std::vector ids; +void Tiktoken::encode(const std::string& str, std::vector& ids) { if (str.empty()) { - return ids; + return; } size_t i = 0; while (i < str.size()) { @@ -362,10 +469,9 @@ std::vector Tiktoken::encode(const std::string& str) { // If no matching symbol is found, this typically means an error in the encoding // or the input text contains characters that the encoder doesn't know how to handle std::cerr << "Error: No encoding found for the sequence starting at position " << i << std::endl; - return {}; + return; } } - return ids; } std::string Tiktoken::decode(int id) { @@ -409,8 +515,7 @@ std::vector BertTokenizer::word_piece(const std::string& token) { return ids; } -std::vector BertTokenizer::encode(const std::string& str) { - std::vector ids; +void BertTokenizer::encode(const std::string& str, std::vector& ids) { 
std::vector tokens; std::string current_token; size_t i = 0; @@ -460,7 +565,6 @@ std::vector BertTokenizer::encode(const std::string& str) { ids.push_back(id); } } - return ids; } std::wstring utf8_to_wstring(const std::string& str) { @@ -484,8 +588,7 @@ void byte_encode_token(const std::string& token, } } -bool HuggingfaceTokenizer::load(const std::string& filename) { - std::ifstream tok_file(filename); +bool HuggingfaceTokenizer::load_vocab(std::ifstream& tok_file) { std::string line, token; // get nums int vocab_len, merge_len; @@ -506,7 +609,6 @@ bool HuggingfaceTokenizer::load(const std::string& filename) { bpe_ranks_.insert({{utf8_to_wstring(line.substr(0, d)), utf8_to_wstring(line.substr(d + 1))}, i}); } - tok_file.close(); // bytes_to_unicode auto _insert_range = [=](int start, int end) { for (int c = start; c <= end; c++) { @@ -601,8 +703,8 @@ void HuggingfaceTokenizer::bpe(const std::wstring& token, const BPERanks& bpe_ra } } -std::vector HuggingfaceTokenizer::encode(const std::string& str) { - std::regex re("('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?(_+)| ?[[:digit:]]+| ?[^\\s\\w]+|\\s+)"); +void HuggingfaceTokenizer::encode(const std::string& str, std::vector& ids) { + std::regex re("('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\\s\\w]+|\\s+)"); std::string input = str; std::vector result; std::string token; @@ -622,21 +724,22 @@ std::vector HuggingfaceTokenizer::encode(const std::string& str) { result.push_back(wstring_to_utf8(ws)); } } - std::vector ids; for (auto s : result) { ids.push_back(encoder_.at(s)); } - return ids; } std::string HuggingfaceTokenizer::decode(int id) { + // printf("decode id = %d, %lu, %s#\n", id, decoder_.size(), decoder_.at(id).c_str()); if (id >= decoder_.size()) { return ""; } std::wstring w = utf8_to_wstring(decoder_.at(id)); std::string r; for (wchar_t c : w) { - r.push_back(char(u2b_.at(c))); + if (u2b_.find(c) != u2b_.end()) { + r.push_back(char(u2b_.at(c))); + } } return r; } diff --git a/transformers/llm/export/llm_export.py b/transformers/llm/export/llm_export.py index 08a644455..4b541b247 100644 --- a/transformers/llm/export/llm_export.py +++ b/transformers/llm/export/llm_export.py @@ -1,6 +1,7 @@ import os import base64 import glob +import json import shutil import argparse import torch @@ -66,8 +67,8 @@ def __init__(self, lm): def forward(self, hidden_states): m_logits = self.lm(hidden_states) - token = torch.argmax(m_logits) - return token + # token = torch.argmax(m_logits) + return m_logits class LLM(torch.nn.Module): ''' @@ -90,16 +91,19 @@ def __init__(self, args): # default is False, just set True when using below command: # `python llm_export ../path --export --embed_bin` to export single model without embedding self.without_embed = False - self.embed_bin = args.embed_bin - if self.embed_bin: - self.embed_bf16 = True - else: - self.embed_bf16 = args.embed_bf16 + self.embed_bin = True + self.embed_bf16 = args.embed_bf16 self.skip_slim = args.skip_slim tokenizer_model = os.path.join(args.path, 'tokenizer.model') - if os.path.exists(tokenizer_model): - self.sp_model = spm.SentencePieceProcessor(tokenizer_model) - else: + ice_text_model = os.path.join(args.path, 'ice_text.model') + try: + if os.path.exists(tokenizer_model): + self.sp_model = spm.SentencePieceProcessor(tokenizer_model) + elif os.path.exists(ice_text_model): + self.sp_model = spm.SentencePieceProcessor(ice_text_model) + else: + self.sp_model = None + except: self.sp_model = None merge_file = os.path.join(args.path, 'merges.txt') if 
os.path.exists(merge_file): @@ -113,10 +117,21 @@ def __init__(self, args): self.lora_path = args.lora_path self.load_hf(args.path) self.load_model() + self.llm_config = { + 'hidden_size' : self.hidden_size, + 'layer_nums' : self.block_nums, + 'attention_mask': self.attention_mask_type, + 'key_value_shape': self.past_kv_shape[1:], + "prompt_template": self.build_prompt('%s'), + 'is_visual': False + } def load_hf(self, model_path: str): self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - self.model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).float().eval() + try: + self.model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).float().eval() + except: + self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True).float().eval() self.config = self.model.config if self.lora_path is not None: adapter = PeftModel.from_pretrained(self.model, model_id=self.lora_path) @@ -149,11 +164,11 @@ def __decode(self, hidden_states, attention_mask, position_ids, past_key_values) for i in range(self.block_nums): hidden_states, kv = self.blocks[i](hidden_states, attention_mask, position_ids, past_key_values[i]) presents.append(kv) - token_id = self.lm(hidden_states).view(1) + logits = self.lm(hidden_states).reshape(-1) presents = torch.stack(presents) self.seq_len += 1 self.token_len += 1 - return token_id, presents + return logits, presents def forward(self, input_ids, attention_mask, position_ids, past_key_values): if self.without_embed: @@ -188,8 +203,9 @@ def response(self, query): while self.token_len < self.max_length: attention_mask = self.get_attention_mask() position_ids = self.get_position_ids() - token_id, past_key_values = self.forward(token_id, attention_mask, position_ids, past_key_values) - if token_id == self.stop_id or token_id in self.stop_ids: + logits, past_key_values = self.forward(token_id, attention_mask, position_ids, past_key_values) + token_id = torch.argmax(logits) + if token_id in self.stop_ids: print("", end='\n') break word = self.id_to_str(token_id) @@ -218,7 +234,7 @@ def export_lm(self): onnx_model, verbose=self.export_verbose, input_names=['hidden_states'], - output_names=['token_id'], + output_names=['logits'], do_constant_folding=True, opset_version=15) if not self.skip_slim: @@ -272,7 +288,7 @@ def export_embed(self): tensor_data = model.embed.weight.data data_ptr = tensor_data.untyped_storage().data_ptr() buffer = (ctypes.c_byte * (tensor_data.numel() * 2)).from_address(data_ptr) - with open(f'./{self.mnn_path}/embeddings_bf16.bin', 'wb') as f: + with open(f'./{self.onnx_path}/embeddings_bf16.bin', 'wb') as f: f.write(buffer) return input_ids = torch.arange(3, dtype=torch.long) @@ -341,6 +357,11 @@ def export_blocks(self): for i in range(self.block_nums): self.export_block(i) + def export_config(self, is_single = True): + self.llm_config['is_single'] = is_single + with open(f'./{self.onnx_path}/llm_config.json', 'w', encoding='utf-8') as f: + json.dump(self.llm_config, f, ensure_ascii=False, indent=4) + def export(self): model = self self.seq_len = 3 @@ -361,13 +382,23 @@ def export(self): input_names=[ 'input_ids', 'attention_mask', 'position_ids', 'past_key_values' ], - output_names=['token_id', 'presents'], + output_names=['logits', 'presents'], dynamic_axes=self.model_dynamic_axes, do_constant_folding=True, opset_version=15) print('export done!') if not self.skip_slim: slim(onnx_model, output_model=onnx_model) + for file_path in 
glob.glob(f'./{self.onnx_path}/onnx__*'): + try: + os.remove(file_path) + except FileNotFoundError: + pass + for file_path in glob.glob(f'./{self.onnx_path}/model.*'): + try: + os.remove(file_path) + except FileNotFoundError: + pass if self.export_test: # test original_outs = model(input_ids, attention_mask, position_ids, past_key_values) @@ -387,13 +418,36 @@ def export(self): self.without_embed = False def export_tokenizer(self): + # TOKENIZER MAGIC NUMBER + MAGIC_NUMBER = 430 + # TOKENIZER TYPE + SENTENCEPIECE = 0; TIKTOIKEN = 1; BERT = 2; HUGGINGFACE = 3 + def write_line(fp, *args): + for arg in args: + for token in arg: + fp.write(str(token) + ' ') + fp.write('\n') + def write_header(fp, type, speicals, prefix = []): + fp.write(f'{MAGIC_NUMBER} {type}\n') + fp.write(f'{len(speicals)} {len(self.stop_ids)} {len(prefix)}\n') + write_line(fp, speicals, self.stop_ids, prefix) + file_path = os.path.join(self.onnx_path, "tokenizer.txt") + special_list = list(self.tokenizer.added_tokens_decoder.keys()) + if hasattr(self.tokenizer, 'special_tokens'): + for k, v in self.tokenizer.special_tokens.items(): + special_list.append(v) + if hasattr(self.tokenizer, 'gmask_token_id'): + special_list.append(self.tokenizer.gmask_token_id) + vocab_list = [] + prefix_list = [] + if hasattr(self.tokenizer, 'get_prefix_tokens'): + prefix_list = self.tokenizer.get_prefix_tokens() if self.sp_model is not None: # senetencepiece print('# senetencepiece tokenier') NORMAL = 1; UNKNOWN = 2; CONTROL = 3 USER_DEFINED = 4; UNUSED = 5; BYTE = 6 - fp = open(file_path, "w", encoding="utf8") for i in range(self.sp_model.GetPieceSize()): token = self.sp_model.IdToPiece(i) score = self.sp_model.GetScore(i) @@ -412,23 +466,37 @@ def export_tokenizer(self): if '<|blank_' in token: token = ' ' * int(token[8:token.find('|>')]) if '▁' in token: token = token.replace('▁', ' ') token_encode = base64.b64encode(token.encode("utf-8")).decode("utf8") - fp.write(f'{token_encode} {score} {type}\n') - fp.close() + vocab_list.append(f'{token_encode} {score} {type}\n') + with open(file_path, "w", encoding="utf8") as fp: + write_header(fp, SENTENCEPIECE, special_list, prefix_list) + fp.write(f'{len(vocab_list)}\n') + for vocab in vocab_list: + fp.write(vocab) elif hasattr(self.tokenizer, 'mergeable_ranks'): print('# tiktoken tokenier') # tikton + vocab_list = [] + for k, v in self.tokenizer.mergeable_ranks.items(): + line = base64.b64encode(k).decode("utf8") + "\n" + vocab_list.append(line) + if hasattr(self.tokenizer, 'special_tokens'): + for k, v in self.tokenizer.special_tokens.items(): + line = base64.b64encode(k.encode("utf-8")).decode("utf8") + "\n" + vocab_list.append(line) + if hasattr(self.tokenizer, 'added_tokens_decoder'): + for k, v in self.tokenizer.added_tokens_decoder.items(): + line = base64.b64encode(v.__str__().encode("utf-8")).decode("utf8") + "\n" + vocab_list.append(line) with open(file_path, "w", encoding="utf8") as fp: - for k, v in self.tokenizer.mergeable_ranks.items(): - line = base64.b64encode(k).decode("utf8") + "\n" - fp.write(line) - if hasattr(self.tokenizer, 'special_tokens'): - for k, v in self.tokenizer.special_tokens.items(): - line = base64.b64encode(k.encode("utf-8")).decode("utf8") + "\n" - fp.write(line) + write_header(fp, TIKTOIKEN, special_list, prefix_list) + fp.write(f'{len(vocab_list)}\n') + for vocab in vocab_list: + fp.write(vocab) elif self.merge_txt is not None: # huggingface tokenizer merge_list = [] vocab = self.tokenizer.get_vocab() + special_list = 
list(self.tokenizer.added_tokens_decoder.keys()) vocab_list = ['' for i in range(len(vocab))] # load vocab for k, v in vocab.items(): @@ -439,13 +507,15 @@ def export_tokenizer(self): merge_list.append(line) # write to tokenizer.txt with open(file_path, "w", encoding="utf8") as fp: + write_header(fp, HUGGINGFACE, special_list) fp.write(f'{len(vocab_list)} {len(merge_list)}\n') for v in vocab_list: fp.write(v + '\n') for m in merge_list: fp.write(m) else: - # huggingface tokenizer + print('# other tiktoken tokenier') + # other tikton def unicode_to_byte(u: int): if u >= 256 and u <= 288: return u - 256 @@ -458,25 +528,28 @@ def unicode_to_byte(u: int): if u == 9601: # _ return 95 return u + vocab = self.tokenizer.get_vocab() + vocab_list = ['' for i in range(len(vocab))] + for k, v in vocab.items(): + try: + vocab_list[int(v)] = bytes([unicode_to_byte(ord(c)) for c in k]).decode('utf-8', errors='ignore') + except: + vocab_list[int(v)] = k + special_list = list(self.tokenizer.added_tokens_decoder.keys()) with open(file_path, "w", encoding="utf8") as fp: - vocab = self.tokenizer.get_vocab() - vocab_list = ['' for i in range(len(vocab))] - for k, v in vocab.items(): - try: - vocab_list[int(v)] = bytes([unicode_to_byte(ord(c)) for c in k]).decode('utf-8', errors='ignore') - except: - vocab_list[int(v)] = k + write_header(fp, TIKTOIKEN, special_list) + fp.write(f'{len(vocab_list)}\n') for v in vocab_list: line = base64.b64encode(v.encode('utf-8')).decode("utf8") + "\n" fp.write(line) - # chatglm class GLMBlock(torch.nn.Module): def __init__(self, block, block_id, final_layernorm = None): super().__init__() self.block = block self.block_id = block_id + self.hidden_size = 4096 self.final_layernorm = final_layernorm def forward(self, hidden_states, attention_mask, position_ids, past_kv): @@ -495,8 +568,9 @@ def forward(self, hidden_states, attention_mask, position_ids, past_kv): class Chatglm_6b(LLM): def __init__(self, args): - super().__init__(args) + self.attention_mask_type = 'glm' self.model_name = 'Chatglm_6b' + super().__init__(args) def load_model(self): transformer = self.model.transformer @@ -505,7 +579,7 @@ def load_model(self): self.blocks_ = transformer.layers self.final_layernorm_ = transformer.final_layernorm # some wrapper - self.stop_id = self.tokenizer._convert_token_to_id(self.tokenizer.eos_token) + self.stop_ids.append(self.tokenizer._convert_token_to_id(self.tokenizer.eos_token)) self.block_nums = len(self.blocks_) self.lm = Lm(self.lm_) # chatglm embedding and lm using same param, copy embedding when using bf16 @@ -535,31 +609,38 @@ def get_attention_mask(self) -> torch.Tensor: if self.token_len: return torch.zeros([1]).bool().reshape([1, 1, 1, 1]) attention_mask = torch.zeros([self.seq_len, self.seq_len], dtype=torch.bool) - for i in range(self.seq_len): + for i in range(self.seq_len - 1): attention_mask[i][-1] = True attention_mask = attention_mask.reshape([1, 1, self.seq_len, self.seq_len]) return attention_mask def get_position_ids(self) -> torch.Tensor: if self.token_len: - return torch.tensor([1, self.seq_len - self.context_len]).reshape([1, 2, 1]) + return torch.tensor([self.context_len, self.token_len + 1]).reshape([1, 2, 1]) position_ids_0 = torch.arange(self.seq_len, dtype=torch.long) position_ids_1 = torch.zeros(self.seq_len, dtype=torch.long) + position_ids_0[-1] = position_ids_0[-2] position_ids_1[-1] = 1 position_ids = torch.stack([position_ids_0, position_ids_1]).view(1, 2, -1) return position_ids + def build_prompt(self, query): + return f'{query}[gMASK]' + # 
chatglm2 class GLM2Block(torch.nn.Module): - def __init__(self, block, block_id, final_layernorm = None): + def __init__(self, block, block_id, config, final_layernorm = None): super().__init__() self.block = block self.block_id = block_id self.final_layernorm = final_layernorm + self.config = config self.hidden_size = 4096 def forward(self, hidden_states, attention_mask, position_ids, past_kv): - theta = 1.0 / (10000 ** (torch.arange(0, 64, 2, dtype=torch.float32) / 64)) + rope_ratio = self.config.rope_ratio + base = 10000 * rope_ratio + theta = 1.0 / (base ** (torch.arange(0, 64, 2, dtype=torch.float32) / 64)) position_ids = position_ids.float().reshape(-1, 1) idx_theta = position_ids * theta rotary_pos_emb = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1).unsqueeze(0).contiguous() @@ -576,6 +657,7 @@ def forward(self, hidden_states, attention_mask, position_ids, past_kv): class Chatglm2_6b(LLM): def __init__(self, args): + self.attention_mask_type = 'glm2' super().__init__(args) self.model_name = 'Chatglm2_6b' if 'codegeex2-6b' in args.path: @@ -588,14 +670,21 @@ def load_model(self): self.blocks_ = transformer.encoder.layers self.final_layernorm_ = transformer.encoder.final_layernorm # some wrapper - self.stop_id = self.tokenizer.eos_token_id - if self.stop_id is None: + if self.tokenizer.eos_token_id is None: # codegeex2-6b - self.stop_id = self.tokenizer.tokenizer.eos_id + self.stop_ids.append(self.tokenizer.tokenizer.eos_id) + else: + self.stop_ids.append(self.tokenizer.eos_token_id) + if hasattr(self.config, 'eos_token_id'): + if type(self.config.eos_token_id) is list: + for eos_id in self.config.eos_token_id: + self.stop_ids.append(eos_id) + elif type(self.config.eos_token_id) is int: + self.stop_ids.append(self.config.eos_token_id) self.block_nums = len(self.blocks_) self.embed = Embedding(self.embed_, self.embed_bf16) self.lm = Lm(self.lm_) - self.blocks = [GLM2Block(self.blocks_[i], i, self.final_layernorm_ if i == len(self.blocks_) - 1 else None) for i in range(self.block_nums)] + self.blocks = [GLM2Block(self.blocks_[i], i, self.config, self.final_layernorm_ if i == len(self.blocks_) - 1 else None) for i in range(self.block_nums)] # some config for export self.past_kv_shape = [28, 2, 0, 1, 2, 128] self.block_dynamic_axes = { @@ -610,6 +699,21 @@ def load_model(self): "position_ids" : { 0: "seq_len" }, "past_key_values" : { 2: "history_len" } } + num_layers = self.config.num_layers + if num_layers > 28: + self.past_kv_shape = [num_layers, 2, 1, 2, 0, 128] + self.block_dynamic_axes = { + "inputs_embeds" : { 0: "seq_len" }, + "attention_mask" : { 2: "seq_len", 3: "seq_len" }, + "position_ids" : { 0: "seq_len" }, + "past_key_values" : { 3: "history_len" } + } + self.model_dynamic_axes = { + "input_ids" : { 0: "seq_len" }, + "attention_mask" : { 2: "seq_len", 3: "seq_len" }, + "position_ids" : { 0: "seq_len" }, + "past_key_values" : { 4: "history_len" } + } def get_attention_mask(self) -> torch.Tensor: if self.token_len: @@ -691,7 +795,17 @@ def forward(self, hidden_states, attention_mask, position_ids, past_kv): class Qwen_Chat(LLM): def __init__(self, args): + self.attention_mask_type = 'int' super().__init__(args) + if 'VL' in self.model_name: + self.llm_config['is_visual'] = True + self.llm_config['attention_mask'] = 'float' + self.llm_config['img_size'] = 448 + self.llm_config['imgpad_len'] = 256 + self.llm_config['img_start'] = self.tokenizer.img_start_id + self.llm_config['img_end'] = self.tokenizer.img_end_id + self.llm_config['img_pad'] = 
self.tokenizer.img_pad_id + def load_model(self): # Qwen models @@ -710,7 +824,7 @@ def load_model(self): self.image_start_id = transformer.config.visual['image_start_id'] self.image_size = transformer.config.visual['image_size'] # some wrapper - self.stop_id = self.tokenizer.im_end_id + self.stop_ids.append(self.tokenizer.im_end_id) self.block_nums = len(self.blocks_) self.hidden_size = transformer.embed_dim self.embed = Embedding(self.embed_, self.embed_bf16) @@ -742,7 +856,7 @@ def build_prompt(self, query): def get_attention_mask(self) -> torch.Tensor: if self.model_name == 'Qwen-VL': if self.token_len: - return torch.zeros([1, 1, 1, self.seq_len], dtype=torch.float32) + return torch.zeros([1, 1, 1, 1], dtype=torch.float32) return (1 - torch.tril(torch.ones([1, 1, self.seq_len, self.seq_len]))) * torch.finfo(torch.float32).min if self.token_len: return torch.ones([1, 1, 1, 1]).bool() @@ -794,6 +908,7 @@ def forward(self, hidden_states, attention_mask, position_ids, past_kv): past_key_value=past_kv, rotary_pos_emb=rotary_pos_emb, use_cache=True) + if self.final_layernorm is not None: hidden_states = self.final_layernorm(hidden_states) hidden_states = hidden_states.view(-1, self.hidden_size)[-1].view(1, 1, self.hidden_size) @@ -804,6 +919,7 @@ def forward(self, hidden_states, attention_mask, position_ids, past_kv): class Qwen2_Chat(LLM): def __init__(self, args): + self.attention_mask_type = 'float' super().__init__(args) def load_model(self): @@ -815,14 +931,14 @@ def load_model(self): self.blocks_ = transformer.layers self.final_layernorm_ = transformer.norm # some wrapper - self.stop_id = self.tokenizer.eos_token_id + self.stop_ids.append(self.tokenizer.eos_token_id) if hasattr(self.model, 'generation_config'): - self.stop_ids.append(self.stop_id) for id in self.model.generation_config.eos_token_id: self.stop_ids.append(id) self.block_nums = self.config.num_hidden_layers self.hidden_size = self.config.hidden_size self.num_heads = self.config.num_attention_heads + self.kv_heads = self.config.num_key_value_heads self.rope_theta = self.config.rope_theta self.head_dim = self.hidden_size // self.num_heads if self.embed_.weight is self.lm_.weight: @@ -832,7 +948,7 @@ def load_model(self): else: self.embed = Embedding(self.embed_, self.embed_bf16) self.lm = Lm(self.lm_) - self.past_kv_shape = [self.block_nums, 2, 1, 0, self.num_heads, self.head_dim] + self.past_kv_shape = [self.block_nums, 2, 1, 0, self.kv_heads, self.head_dim] self.blocks = [QWEN2Block(self.model_name, self.blocks_[i], i, self.config, self.final_layernorm_ if i == len(self.blocks_) - 1 else None) for i in range(self.block_nums)] # some config for export self.block_dynamic_axes = { @@ -881,19 +997,28 @@ def visual_embed(self, input_ids): # llama2 class LLAMA2Block(torch.nn.Module): - def __init__(self, block, block_id, hidden_size, final_layernorm = None): + def __init__(self, block, block_id, hidden_size, head_dim, final_layernorm = None): super().__init__() self.block = block self.block_id = block_id + self.head_dim = head_dim self.final_layernorm = final_layernorm self.hidden_size = hidden_size def forward(self, hidden_states, attention_mask, position_ids, past_kv): + theta = 1.0 / (10000.0 ** (torch.arange(0, self.head_dim, 2, dtype=torch.float32) / self.head_dim)) + position_ids = position_ids.float().reshape(-1, 1) + idx_theta = position_ids * theta + rotary_pos_emb = torch.cat((idx_theta, idx_theta), dim=-1) + rotary_pos_emb = rotary_pos_emb.unsqueeze(1).unsqueeze(0) + rotary_pos_emb = 
torch.stack([torch.cos(rotary_pos_emb), torch.sin(rotary_pos_emb)]) hidden_states = hidden_states.view(1, -1, self.hidden_size) + position_ids = position_ids.view(1, -1) hidden_states, presents = self.block(hidden_states, attention_mask, position_ids, past_kv, + rotary_pos_emb=rotary_pos_emb, use_cache=True) if self.final_layernorm is not None: hidden_states = self.final_layernorm(hidden_states) @@ -904,6 +1029,7 @@ def forward(self, hidden_states, attention_mask, position_ids, past_kv): class Llama2_7b_Chat(LLM): def __init__(self, args): + self.attention_mask_type = 'float' self.model_name = 'Llama2_7b' if 'Baichuan2' in args.path: self.model_name = 'Baichuan2_7B' @@ -928,33 +1054,35 @@ def load_model(self): self.final_layernorm_ = transformer.norm # some wrapper self.hidden_size = self.embed_.weight.shape[-1] - self.stop_id = self.tokenizer.eos_token_id + self.stop_ids.append(self.tokenizer.eos_token_id) if hasattr(self.model, 'generation_config'): - self.stop_ids.append(self.stop_id) self.stop_ids.append(self.model.generation_config.eos_token_id) if self.model_name == 'Llama3_8B': self.stop_ids.append(self.tokenizer.convert_tokens_to_ids("<|eot_id|>")) self.block_nums = len(self.blocks_) self.embed = Embedding(self.embed_, self.embed_bf16) self.lm = Lm(self.lm_) - self.blocks = [LLAMA2Block(self.blocks_[i], i, self.hidden_size, self.final_layernorm_ if i == len(self.blocks_) - 1 else None) for i in range(self.block_nums)] self.block_nums = self.config.num_hidden_layers self.hidden_size = self.config.hidden_size self.num_attention_heads = self.config.num_attention_heads self.head_dim = self.hidden_size // self.num_attention_heads - self.num_key_value_heads = self.config.num_key_value_heads - self.past_kv_shape = [self.block_nums, 2, 1, self.num_key_value_heads, 0, self.head_dim] + if hasattr(self.config, 'num_key_value_heads'): + self.num_key_value_heads = self.config.num_key_value_heads + else: + self.num_key_value_heads = self.config.num_attention_heads + self.blocks = [LLAMA2Block(self.blocks_[i], i, self.hidden_size, self.head_dim, self.final_layernorm_ if i == len(self.blocks_) - 1 else None) for i in range(self.block_nums)] + self.past_kv_shape = [self.block_nums, 2, 1, 0, self.num_key_value_heads, self.head_dim] self.block_dynamic_axes = { "inputs_embeds" : { 0: "seq_len" }, "attention_mask" : { 2: "seq_len", 3: "seq_len" }, "position_ids" : { 1: "seq_len" }, - "past_key_values" : { 3: "history_len" } + "past_key_values" : { 2: "history_len" } } self.model_dynamic_axes = { "input_ids" : { 0: "seq_len" }, "attention_mask" : { 2: "seq_len", 3: "seq_len" }, "position_ids" : { 1: "seq_len" }, - "past_key_values" : { 4: "history_len" } + "past_key_values" : { 3: "history_len" } } def build_prompt(self, query): @@ -967,10 +1095,10 @@ def build_prompt(self, query): if 'Yi' in self.model_name: return f'<|im_start|> user\n{query}<|im_end|>\n<|im_start|> assistant\n' if 'deepseek' in self.model_name: - return f'<|begin▁of▁sentence|>User: {query}\nAssistant:' + return f'<|begin_of_sentence|>User: {query}\n\nAssistant:' if 'Llama3' in self.model_name: return f'<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{query}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' - return f'[INST]{query}[/INST]' + return f'[INST]{query}[/INST]' def get_attention_mask(self) -> torch.Tensor: if self.token_len: @@ -1007,6 +1135,7 @@ def forward(self, hidden_states, attention_mask, position_ids, past_kv): class phi_2(LLM): def __init__(self, args): + self.attention_mask_type = 'glm' 
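The rotary table that LLAMA2Block.forward builds above, before handing control to the HuggingFace block, can be reproduced in isolation. The sketch below is illustrative only (the helper name and the example head_dim are not part of the patch) and assumes the same rotate_half layout used by the modified modeling files; precomputing cos/sin outside the attention op keeps the per-block ONNX graphs free of the upstream rotary-embedding cache, which appears to be what the llama2/llama3 attention-fuse fix relies on.

    import torch

    def build_rotary_pos_emb(position_ids: torch.Tensor, head_dim: int, base: float = 10000.0) -> torch.Tensor:
        # Hypothetical helper mirroring the rotary_pos_emb computed in LLAMA2Block.forward.
        theta = 1.0 / (base ** (torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim))
        idx_theta = position_ids.float().reshape(-1, 1) * theta      # [seq_len, head_dim // 2]
        angles = torch.cat((idx_theta, idx_theta), dim=-1)           # [seq_len, head_dim]
        angles = angles.unsqueeze(1).unsqueeze(0)                    # [1, seq_len, 1, head_dim]
        return torch.stack([torch.cos(angles), torch.sin(angles)])   # [2, 1, seq_len, 1, head_dim]

    # cos/sin broadcast directly against [batch, seq_len, num_heads, head_dim] states.
    cos, sin = build_rotary_pos_emb(torch.arange(8), head_dim=128)
    print(cos.shape)  # torch.Size([1, 8, 1, 128])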
super().__init__(args) self.model_name = 'phi-2' self.asymmetric = False # TODO: some precision bug when using asymmetric @@ -1019,7 +1148,7 @@ def load_model(self): self.blocks_ = transformer.h # self.final_layernorm_ = transformer.final_layernorm # some wrapper - self.stop_id = self.tokenizer.eos_token_id + self.stop_ids.append(self.tokenizer.eos_token_id) self.block_nums = len(self.blocks_) self.embed = Embedding(self.embed_, self.embed_bf16) self.lm = Lm(self.lm_) @@ -1067,6 +1196,8 @@ def forward(self, hidden_states, attention_mask): class bge(LLM): def __init__(self, args): + self.attention_mask_type = 'int' + self.past_kv_shape = [] super().__init__(args) self.model_name = 'bge-large-zh' @@ -1092,13 +1223,14 @@ def response(self, query): return res def load_model(self): + self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True).float().eval() transformer = self.model.encoder self.lm_ = self.model.pooler self.embed_ = self.model.embeddings self.hidden_size = self.embed_.word_embeddings.weight.shape[-1] self.blocks_ = transformer.layer # some wrapper - self.stop_id = self.tokenizer.eos_token_id + self.stop_ids = [] self.block_nums = len(self.blocks_) self.embed = self.embed_ self.lm = self.lm_ @@ -1157,6 +1289,9 @@ def export(self): if self.export_mnn: onnx2mnn(onnx_model, self.mnn_path, 8, True, bizCode=token_str) + def build_prompt(self, query): + return f'[CLS]{query}[SEP]' + def get_position_ids(self) -> torch.Tensor: return torch.arange(self.seq_len, dtype=torch.long).unsqueeze(0) @@ -1189,8 +1324,9 @@ def export(self): llm_models = { 'chatglm-6b': Chatglm_6b, 'chatglm2-6b': Chatglm2_6b, - 'chatglm3-6b': Chatglm3_6b, 'codegeex2-6b': Chatglm2_6b, + 'chatglm3-6b': Chatglm3_6b, + 'glm-4-9b-chat': Chatglm3_6b, 'Qwen-7B-Chat': Qwen_Chat, 'Qwen-1_8B-Chat': Qwen_Chat, 'Qwen-1_8B': Qwen_Chat, @@ -1199,6 +1335,9 @@ def export(self): 'Qwen1_5-1_8B-Chat': Qwen2_Chat, 'Qwen1_5-4B-Chat': Qwen2_Chat, 'Qwen1_5-7B-Chat': Qwen2_Chat, + 'Qwen2-0_5B-Instruct': Qwen2_Chat, + 'Qwen2-1_5B-Instruct': Qwen2_Chat, + 'Qwen2-7B-Instruct': Qwen2_Chat, 'Baichuan2-7B-Chat': Llama2_7b_Chat, 'Llama-2-7b-chat-ms': Llama2_7b_Chat, 'Llama-3-8B-Instruct': Llama2_7b_Chat, @@ -1235,16 +1374,17 @@ def export(self): '\n\t- block models.' '\n\t- lm_head model.' 
) - parser.add_argument('--export_token', action='store_true', help='export llm tokenizer to a txt file.') - parser.add_argument('--export_embed', action='store_true', help='export llm embedding to an `onnx` model.') parser.add_argument('--export_visual', action='store_true', help='export llm visual model to an `onnx` model.') parser.add_argument('--export_lm', action='store_true', help='export llm lm_head to an `onnx` model.') parser.add_argument('--export_block', type=int, help='export llm block [id] to an `onnx` model.') parser.add_argument('--export_blocks', action='store_true', help='export llm all blocks to `onnx` models.') - parser.add_argument('--embed_bin', action='store_true', help='export embedding weight as bin file with dtype `bfloat16`') - parser.add_argument('--embed_bf16', action='store_true', help='using `bfloat16` replace `float32` in embedding.') parser.add_argument('--skip_slim', action='store_true', help='Whether or not to skip onnx-slim.') + # No use now, add invoid of call error + parser.add_argument('--export_token', action='store_true', help='export llm tokenizer to a txt file.') + parser.add_argument('--export_embed', action='store_true', help='export llm embedding to an `onnx` model.') + parser.add_argument('--embed_bf16', default=True, action='store_true', help='using `bfloat16` replace `float32` in embedding.') + parser.add_argument('--embed_bin', action='store_true', help='export embedding weight as bin file with dtype `bfloat16`') args = parser.parse_args() model_path = args.path @@ -1267,14 +1407,15 @@ def export(self): if args.test is not None: llm_exporter.response(args.test) + if args.export or args.export_split: + llm_exporter.export_config(args.export) + if args.export: llm_exporter.export() - if args.export_token: - llm_exporter.export_tokenizer() + llm_exporter.export_tokenizer() - if args.export_embed or args.export_split: - llm_exporter.export_embed() + llm_exporter.export_embed() if args.export_visual or args.export_split: llm_exporter.export_visual() @@ -1286,4 +1427,4 @@ def export(self): llm_exporter.export_blocks() if args.export_block is not None: - llm_exporter.export_block(args.export_block) + llm_exporter.export_block(args.export_block) \ No newline at end of file diff --git a/transformers/llm/export/llm_models/Baichuan2-7B-Chat/modeling_baichuan.py b/transformers/llm/export/llm_models/Baichuan2-7B-Chat/modeling_baichuan.py index 9f2968cc4..5a0b69e83 100755 --- a/transformers/llm/export/llm_models/Baichuan2-7B-Chat/modeling_baichuan.py +++ b/transformers/llm/export/llm_models/Baichuan2-7B-Chat/modeling_baichuan.py @@ -128,7 +128,7 @@ def forward(self, x, seq_len=None): self.sin_cached = emb.sin()[None, None, :, :].to(torch.float32).to(x.device) elif self.cos_cached.device != x.device: self.cos_cached = self.cos_cached.to(x.device) - self.sin_cached = self.sin_cached.to(x.device) + self.sin_cached = self.sin_cached.to(x.device) return ( self.cos_cached[:, :, :seq_len, ...], self.sin_cached[:, :, :seq_len, ...], @@ -149,8 +149,8 @@ def apply_rotary_pos_emb(q, k, cos_, sin_, position_ids): # cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] # sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] # print(f'### q.shape = {q.shape}, cos.shape = {cos.shape}') - cos = cos[position_ids] - sin = sin[position_ids] + # cos = cos[position_ids] + # sin = sin[position_ids] q_embed = (q.float() * cos) + (rotate_half(q.float()) * sin) k_embed = (k.float() * cos) + (rotate_half(k.float()) * sin) return q_embed.to(q.dtype), 
k_embed.to(k.dtype) @@ -205,6 +205,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, output_attentions: bool = False, use_cache: bool = False, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: @@ -212,6 +213,7 @@ def forward( proj = self.W_pack(hidden_states) proj = proj.reshape([1, -1, 3, 4096]).permute([2, 0, 1, 3]) + ''' # proj = proj.unflatten(-1, (3, self.hidden_size)).unsqueeze(0).transpose(0, -2).squeeze(-2) query_states = proj[0].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) key_states = proj[1].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) @@ -220,7 +222,10 @@ def forward( kv_seq_len = key_states.shape[-2] if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + if rotary_pos_emb is None: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + else: + cos, sin = rotary_pos_emb query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) # [bsz, nh, t, hd] @@ -239,13 +244,35 @@ def forward( query_states, key_states, value_states, attn_bias=xops.LowerTriangularMask() ) else: - ''' - with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True): - attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attn_mask = attention_mask) - ''' attn_output = self.raw_atten(query_states, key_states, value_states, attention_mask) attn_output = attn_output.transpose(1, 2) - + ''' + #--------------- + query_states = proj[0].view(bsz, q_len, self.num_heads, self.head_dim) + key_states = proj[1].view(bsz, q_len, self.num_heads, self.head_dim) + value_states = proj[2].view(bsz, q_len, self.num_heads, self.head_dim) + kv_seq_len = key_states.shape[1] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[1] + # rope + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + # kv cache + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=1) + value_states = torch.cat((past_value, value_states), dim=1) + past_key_value = torch.stack((key_states, value_states)) + query_states = query_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) + value_states = value_states.transpose(1, 2) + attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) + attn_weights = attn_weights + attention_mask + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + #--------------- attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) attn_output = self.o_proj(attn_output) @@ -274,6 +301,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: @@ -288,6 
+316,7 @@ def forward( attention_mask=attention_mask, position_ids=position_ids, past_key_value=past_key_value, + rotary_pos_emb=rotary_pos_emb, output_attentions=output_attentions, use_cache=use_cache, ) @@ -567,7 +596,7 @@ def set_decoder(self, decoder): def get_decoder(self): return self.model - + @classmethod def from_pretrained( cls, @@ -603,7 +632,7 @@ def from_pretrained( ) else: model_kwargs = kwargs - + if hasattr(config, "quantization_config") and config.quantization_config['load_in_4bit']: try: from .quantizer import init_model_weight_int4 @@ -611,20 +640,20 @@ def from_pretrained( from accelerate.utils import CustomDtype from accelerate.utils import get_balanced_memory except ImportError: - raise ImportError(f"Needs import model weight init func to run quantize.") + raise ImportError(f"Needs import model weight init func to run quantize.") # Instantiate model. init_contexts = [no_init_weights(_enable=True)] init_contexts.append(init_empty_weights()) with ContextManagers(init_contexts): model = cls(config) - + model_file = os.path.join(pretrained_model_name_or_path, 'pytorch_model.bin') - state_dict = torch.load(model_file, map_location="cpu") + state_dict = torch.load(model_file, map_location="cpu") model.is_quantized = True - + device_map = kwargs.pop("device_map", None) torch_dtype = kwargs.pop("torch_dtype", None) - + kwargs = {"no_split_module_classes": model._no_split_modules} target_dtype = CustomDtype.INT4 max_memory = get_balanced_memory( @@ -635,10 +664,10 @@ def from_pretrained( **kwargs, ) kwargs["max_memory"] = max_memory - + device_map = infer_auto_device_map(model, dtype=target_dtype, **kwargs) model = init_model_weight_int4(config, model, state_dict) - + # Set model in evaluation mode to deactivate DropOut modules by default model.eval() # If it is a model with generation capabilities, attempt to load the generation config @@ -663,15 +692,15 @@ def from_pretrained( "Generation config file not found, using a generation config created from the model config." 
) pass - + if device_map is not None: dispatch_model(model, device_map=device_map) - + return model - return super(BaichuanForCausalLM, cls).from_pretrained(pretrained_model_name_or_path, *model_args, - config=config, cache_dir=cache_dir, ignore_mismatched_sizes=ignore_mismatched_sizes, - force_download=force_download, local_files_only=local_files_only, token=token, revision=revision, - use_safetensors=use_safetensors, **kwargs) + return super(BaichuanForCausalLM, cls).from_pretrained(pretrained_model_name_or_path, *model_args, + config=config, cache_dir=cache_dir, ignore_mismatched_sizes=ignore_mismatched_sizes, + force_download=force_download, local_files_only=local_files_only, token=token, revision=revision, + use_safetensors=use_safetensors, **kwargs) def forward( self, diff --git a/transformers/llm/export/llm_models/Llama-2-7b-chat-ms/modeling_llama.py b/transformers/llm/export/llm_models/Llama-2-7b-chat-ms/modeling_llama.py index 8c562c604..493b040b7 100644 --- a/transformers/llm/export/llm_models/Llama-2-7b-chat-ms/modeling_llama.py +++ b/transformers/llm/export/llm_models/Llama-2-7b-chat-ms/modeling_llama.py @@ -182,8 +182,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids): # sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] cos = torch.squeeze(cos) # [seq_len, dim] sin = torch.squeeze(sin) # [seq_len, dim] - cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + # cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + # sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed @@ -282,6 +282,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, output_attentions: bool = False, use_cache: bool = False, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: @@ -306,7 +307,7 @@ def forward( query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - + ''' query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) @@ -314,7 +315,10 @@ def forward( kv_seq_len = key_states.shape[-2] if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + if rotary_pos_emb is None: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + else: + cos, sin = rotary_pos_emb query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: @@ -327,8 +331,32 @@ def forward( # repeat k/v heads if n_kv_heads < n_heads key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + ''' + #--------------- + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + value_states = value_states.view(bsz, q_len, 
self.num_key_value_heads, self.head_dim) + kv_seq_len = key_states.shape[1] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[1] + # rope + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + # kv cache + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=1) + value_states = torch.cat((past_value, value_states), dim=1) + past_key_value = torch.stack((key_states, value_states)) + query_states = query_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) + value_states = value_states.transpose(1, 2) + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + #--------------- + attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( @@ -384,6 +412,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: @@ -411,6 +440,7 @@ def forward( attention_mask=attention_mask, position_ids=position_ids, past_key_value=past_key_value, + rotary_pos_emb=rotary_pos_emb, output_attentions=output_attentions, use_cache=use_cache, ) diff --git a/transformers/llm/export/llm_models/Llama-3-8B-Instruct/modeling_llama.py b/transformers/llm/export/llm_models/Llama-3-8B-Instruct/modeling_llama.py index 8c562c604..493b040b7 100644 --- a/transformers/llm/export/llm_models/Llama-3-8B-Instruct/modeling_llama.py +++ b/transformers/llm/export/llm_models/Llama-3-8B-Instruct/modeling_llama.py @@ -182,8 +182,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids): # sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] cos = torch.squeeze(cos) # [seq_len, dim] sin = torch.squeeze(sin) # [seq_len, dim] - cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + # cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + # sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed @@ -282,6 +282,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, output_attentions: bool = False, use_cache: bool = False, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: @@ -306,7 +307,7 @@ def forward( query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - + ''' query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) @@ -314,7 +315,10 @@ def forward( kv_seq_len = 
key_states.shape[-2] if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + if rotary_pos_emb is None: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + else: + cos, sin = rotary_pos_emb query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: @@ -327,8 +331,32 @@ def forward( # repeat k/v heads if n_kv_heads < n_heads key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + ''' + #--------------- + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + kv_seq_len = key_states.shape[1] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[1] + # rope + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + # kv cache + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=1) + value_states = torch.cat((past_value, value_states), dim=1) + past_key_value = torch.stack((key_states, value_states)) + query_states = query_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) + value_states = value_states.transpose(1, 2) + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + #--------------- + attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( @@ -384,6 +412,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: @@ -411,6 +440,7 @@ def forward( attention_mask=attention_mask, position_ids=position_ids, past_key_value=past_key_value, + rotary_pos_emb=rotary_pos_emb, output_attentions=output_attentions, use_cache=use_cache, ) diff --git a/transformers/llm/export/llm_models/MiniCPM-1.2b/config.json b/transformers/llm/export/llm_models/MiniCPM-1.2b/config.json old mode 100755 new mode 100644 diff --git a/transformers/llm/export/llm_models/MiniCPM-2.4b/config.json b/transformers/llm/export/llm_models/MiniCPM-2.4b/config.json old mode 100755 new mode 100644 diff --git a/transformers/llm/export/llm_models/Qwen-7B-Chat/modeling_qwen.py b/transformers/llm/export/llm_models/Qwen-7B-Chat/modeling_qwen.py index 94437bcd2..698486f6f 100644 --- a/transformers/llm/export/llm_models/Qwen-7B-Chat/modeling_qwen.py +++ b/transformers/llm/export/llm_models/Qwen-7B-Chat/modeling_qwen.py @@ -250,12 +250,7 @@ def _attn(self, query, key, value, attention_mask=None, head_mask=None): attn_weights = torch.matmul(query, key.transpose(-1, -2)) if self.scale_attn_weights: - attn_weights = attn_weights / 
torch.full( - [], - value.size(-1) ** 0.5, - dtype=attn_weights.dtype, - device=attn_weights.device, - ) + attn_weights = attn_weights / math.sqrt(self.head_dim) query_length, key_length = query.size(-2), key.size(-2) if attention_mask is None: @@ -813,7 +808,7 @@ def __init__(self, config): logger.warn("Your device support faster inference by passing bf16=True in \"AutoModelForCausalLM.from_pretrained\".") elif SUPPORT_FP16: logger.warn("Your device support faster inference by passing fp16=True in \"AutoModelForCausalLM.from_pretrained\".") - + if config.use_flash_attn == "auto": if config.bf16 or config.fp16: logger.warn("Try importing flash-attention for faster inference...") diff --git a/transformers/llm/export/llm_models/Qwen-VL-Chat/modeling_qwen.py b/transformers/llm/export/llm_models/Qwen-VL-Chat/modeling_qwen.py index 94c7453b1..d7b3c4798 100755 --- a/transformers/llm/export/llm_models/Qwen-VL-Chat/modeling_qwen.py +++ b/transformers/llm/export/llm_models/Qwen-VL-Chat/modeling_qwen.py @@ -149,14 +149,8 @@ def _attn(self, query, key, value, registered_causal_mask, attention_mask=None, attn_weights = torch.matmul(query, key.transpose(-1, -2)) if self.scale_attn_weights: - attn_weights = attn_weights / torch.full( - [], - value.size(-1) ** 0.5, - dtype=attn_weights.dtype, - device=attn_weights.device, - ) + attn_weights = attn_weights / math.sqrt(self.head_dim) - query_length, key_length = query.size(-2), key.size(-2) # causal_mask = self.bias[ # :, :, key_length - query_length : key_length, :key_length # ] @@ -295,7 +289,7 @@ def forward( else: present = None - if self.use_logn_attn and not self.training: + if self.use_logn_attn and not self.training and False: if self.logn_tensor.device != query.device or self.logn_tensor.dtype != query.dtype: self.logn_tensor = self.logn_tensor.to(query.device).type_as(query) seq_start = key.size(1) - query.size(1) @@ -515,7 +509,7 @@ def get_input_embeddings(self): def set_input_embeddings(self, new_embeddings): self.wte = new_embeddings - + # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): # create causal mask @@ -1110,7 +1104,7 @@ def update_rotary_pos_emb_cache(self, max_seq_len, offset=0, ntk_alpha=1.0): self._ntk_alpha_cached = ntk_alpha seq = torch.arange(self._seq_len_cached, device=self.inv_freq.device) freqs = torch.outer(seq.type_as(self.inv_freq), self.inv_freq) - + emb = torch.cat((freqs, freqs), dim=-1) from einops import rearrange diff --git a/transformers/llm/export/llm_models/Qwen1_5-0_5B-Chat/modeling_qwen2.py b/transformers/llm/export/llm_models/Qwen1_5-0_5B-Chat/modeling_qwen2.py index 8afc2ecb5..595a3e91c 100644 --- a/transformers/llm/export/llm_models/Qwen1_5-0_5B-Chat/modeling_qwen2.py +++ b/transformers/llm/export/llm_models/Qwen1_5-0_5B-Chat/modeling_qwen2.py @@ -313,10 +313,12 @@ def forward( value_states = torch.cat((past_value, value_states), dim=1) past_key_value = torch.stack((key_states, value_states)) query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) value_states = value_states.transpose(1, 2) + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) #--------------- - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + attn_weights = 
torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( diff --git a/transformers/llm/export/llm_models/Qwen1_5-1_8B-Chat/modeling_qwen2.py b/transformers/llm/export/llm_models/Qwen1_5-1_8B-Chat/modeling_qwen2.py index 8afc2ecb5..595a3e91c 100644 --- a/transformers/llm/export/llm_models/Qwen1_5-1_8B-Chat/modeling_qwen2.py +++ b/transformers/llm/export/llm_models/Qwen1_5-1_8B-Chat/modeling_qwen2.py @@ -313,10 +313,12 @@ def forward( value_states = torch.cat((past_value, value_states), dim=1) past_key_value = torch.stack((key_states, value_states)) query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) value_states = value_states.transpose(1, 2) + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) #--------------- - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( diff --git a/transformers/llm/export/llm_models/Qwen1_5-4B-Chat/modeling_qwen2.py b/transformers/llm/export/llm_models/Qwen1_5-4B-Chat/modeling_qwen2.py index 8afc2ecb5..595a3e91c 100644 --- a/transformers/llm/export/llm_models/Qwen1_5-4B-Chat/modeling_qwen2.py +++ b/transformers/llm/export/llm_models/Qwen1_5-4B-Chat/modeling_qwen2.py @@ -313,10 +313,12 @@ def forward( value_states = torch.cat((past_value, value_states), dim=1) past_key_value = torch.stack((key_states, value_states)) query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) value_states = value_states.transpose(1, 2) + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) #--------------- - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( diff --git a/transformers/llm/export/llm_models/Qwen1_5-7B-Chat/modeling_qwen2.py b/transformers/llm/export/llm_models/Qwen1_5-7B-Chat/modeling_qwen2.py index 8afc2ecb5..595a3e91c 100644 --- a/transformers/llm/export/llm_models/Qwen1_5-7B-Chat/modeling_qwen2.py +++ b/transformers/llm/export/llm_models/Qwen1_5-7B-Chat/modeling_qwen2.py @@ -313,10 +313,12 @@ def forward( value_states = torch.cat((past_value, value_states), dim=1) past_key_value = torch.stack((key_states, value_states)) query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) value_states = value_states.transpose(1, 2) + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) #--------------- - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( diff --git a/transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/config.json 
b/transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/config.json new file mode 100755 index 000000000..8f9ea8a58 --- /dev/null +++ b/transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/config.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "auto_map": { + "AutoConfig": "configuration_qwen2.Qwen2Config", + "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM" + }, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 896, + "initializer_range": 0.02, + "intermediate_size": 4864, + "max_position_embeddings": 32768, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 14, + "num_hidden_layers": 24, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.40.1", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/configuration_qwen2.py b/transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/configuration_qwen2.py new file mode 100644 index 000000000..b6ca1ed43 --- /dev/null +++ b/transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/configuration_qwen2.py @@ -0,0 +1,144 @@ +# coding=utf-8 +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Qwen2 model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "Qwen/Qwen2-7B-beta": "https://huggingface.co/Qwen/Qwen2-7B-beta/resolve/main/config.json", +} + + +class Qwen2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a + Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of + Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 151936): + Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Qwen2Model`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 22016): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. 
+ num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 32): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 32768): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + use_sliding_window (`bool`, *optional*, defaults to `False`): + Whether to use sliding window attention. + sliding_window (`int`, *optional*, defaults to 4096): + Sliding window attention (SWA) window size. If not specified, will default to `4096`. + max_window_layers (`int`, *optional*, defaults to 28): + The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. 
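For the Qwen2-0_5B-Instruct checkpoint added in this patch (hidden_size 896, 14 attention heads, 2 key/value heads), the grouped-query arithmetic described above works out as follows. This is a hedged, illustrative sketch rather than part of the patch; the repeat_kv helper simply mirrors the one defined in modeling_qwen2.py further down.

    import torch

    hidden_size, num_attention_heads, num_key_value_heads = 896, 14, 2
    head_dim = hidden_size // num_attention_heads        # 64
    n_rep = num_attention_heads // num_key_value_heads   # 7 query heads share each KV head

    def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
        # Same expand/reshape trick as repeat_kv in modeling_qwen2.py.
        batch, kv_heads, slen, hd = hidden_states.shape
        if n_rep == 1:
            return hidden_states
        hidden_states = hidden_states[:, :, None, :, :].expand(batch, kv_heads, n_rep, slen, hd)
        return hidden_states.reshape(batch, kv_heads * n_rep, slen, hd)

    k = torch.zeros(1, num_key_value_heads, 10, head_dim)
    print(repeat_kv(k, n_rep).shape)  # torch.Size([1, 14, 10, 64])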
+ + ```python + >>> from transformers import Qwen2Model, Qwen2Config + + >>> # Initializing a Qwen2 style configuration + >>> configuration = Qwen2Config() + + >>> # Initializing a model from the Qwen2-7B style configuration + >>> model = Qwen2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "qwen2" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=151936, + hidden_size=4096, + intermediate_size=22016, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=32, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + tie_word_embeddings=False, + rope_theta=10000.0, + use_sliding_window=False, + sliding_window=4096, + max_window_layers=28, + attention_dropout=0.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.use_sliding_window = use_sliding_window + self.sliding_window = sliding_window + self.max_window_layers = max_window_layers + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + + super().__init__( + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/modeling_qwen2.py b/transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/modeling_qwen2.py new file mode 100644 index 000000000..595a3e91c --- /dev/null +++ b/transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/modeling_qwen2.py @@ -0,0 +1,1436 @@ +# coding=utf-8 +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
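One detail shared by every attention rewrite in this patch (Baichuan, Llama-2/3, Qwen1.5, and the modeling_qwen2.py that follows): keys are permuted to [batch, heads, head_dim, kv_len] via permute([0, 2, 3, 1]) so the score matmul needs no transpose inside the traced graph. A quick sanity check, with illustrative shapes only, that this layout yields the same scores as the stock transpose(2, 3) form:

    import torch

    b, h, q_len, kv_len, d = 1, 2, 3, 5, 4
    q = torch.randn(b, h, q_len, d)
    k = torch.randn(b, kv_len, h, d)   # the patch keeps K as [batch, seq, heads, head_dim]

    scores_patch = torch.matmul(q, k.permute(0, 2, 3, 1)) / d ** 0.5
    scores_stock = torch.matmul(q, k.transpose(1, 2).transpose(2, 3)) / d ** 0.5
    print(torch.allclose(scores_patch, scores_stock))  # True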
+""" PyTorch Qwen2 model.""" +import inspect +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_qwen2 import Qwen2Config + + +# if is_flash_attn_2_available(): + #from flash_attn import flash_attn_func, flash_attn_varlen_func + #from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + #_flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + + +logger = logging.get_logger(__name__) + + +_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta" +_CONFIG_FOR_DOC = "Qwen2Config" + +QWEN2_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "Qwen/Qwen2-7B-beta", + # See all Qwen2 models at https://huggingface.co/models?filter=qwen2 +] + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2 +class Qwen2RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Qwen2RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2 +class Qwen2RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. 
+ self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
+ """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2 +class Qwen2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class Qwen2Attention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " + "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + self.rotary_emb = Qwen2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + rotary_pos_emb: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[torch.Tensor] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + ''' + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + kv_seq_len += past_key_value[0].shape[2] + if rotary_pos_emb is None: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + else: + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=2) + value_states = torch.cat((past_value, value_states), dim=2) + # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + past_key_value = torch.stack((key_states, value_states)) + # repeat k/v heads if n_kv_heads < n_heads + # key_states = repeat_kv(key_states, self.num_key_value_groups) + # value_states = repeat_kv(value_states, self.num_key_value_groups) + ''' + #--------------- + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + kv_seq_len = key_states.shape[1] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[1] + # rope + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + # kv cache + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=1) + value_states = torch.cat((past_value, value_states), dim=1) + past_key_value = torch.stack((key_states, value_states)) + query_states = query_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) + value_states = value_states.transpose(1, 2) + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + #--------------- + attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + 
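The rewritten (non-commented) path in the eager `forward` above is the export-oriented variant: q/k/v stay in `[bsz, q_len, heads, head_dim]` layout, RoPE is applied with the externally supplied `rotary_pos_emb` (cos/sin) instead of a `position_ids` gather, and the key is permuted to `[bsz, heads, head_dim, q_len]` so the score matmul needs no further transpose, which appears to be the layout the attention-fuse pass targets. The standalone sketch below is not part of the patch; the sizes and the locally recomputed cos/sin are illustrative assumptions used only to walk through the shapes.

```python
import math
import torch

bsz, q_len, num_heads, num_kv_heads, head_dim = 1, 4, 12, 2, 64

def rotate_half(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

q = torch.randn(bsz, q_len, num_heads, head_dim)
k = torch.randn(bsz, q_len, num_kv_heads, head_dim)
v = torch.randn(bsz, q_len, num_kv_heads, head_dim)

# rotary_pos_emb is assumed to be precomputed outside the layer with a shape
# broadcastable to [bsz, q_len, heads, head_dim], e.g. [1, q_len, 1, head_dim].
inv_freq = 1.0 / (10000.0 ** (torch.arange(0, head_dim, 2).float() / head_dim))
freqs = torch.outer(torch.arange(q_len).float(), inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)[None, :, None, :]
cos, sin = emb.cos(), emb.sin()

# fused RoPE path, as in the forward above (no position_ids indexing)
q = q * cos + rotate_half(q) * sin
k = k * cos + rotate_half(k) * sin

q = q.transpose(1, 2)        # [bsz, heads, q_len, head_dim]
k = k.permute([0, 2, 3, 1])  # [bsz, kv_heads, head_dim, q_len]
v = v.transpose(1, 2)        # [bsz, kv_heads, q_len, head_dim]

# GQA: repeat kv heads along dim 1 (same effect as repeat_kv above)
k = k.repeat_interleave(num_heads // num_kv_heads, dim=1)
v = v.repeat_interleave(num_heads // num_kv_heads, dim=1)

scores = torch.matmul(q, k) / math.sqrt(head_dim)   # [bsz, heads, q_len, q_len]
out = torch.matmul(scores.softmax(dim=-1), v)       # [bsz, heads, q_len, head_dim]
print(out.shape)  # torch.Size([1, 12, 4, 64])
```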
+ +class Qwen2FlashAttention2(Qwen2Attention): + """ + Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention` + as the weights of the module stays untouched. The only required change would be on the forward pass + where it needs to correctly call the public API of flash attention and deal with padding tokens + in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom + config.max_window_layers layers. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # Because the input can be padded, the absolute sequence length depends on the max position id. + rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and self.config.use_sliding_window + ) + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." 
+ ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. 
+ + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`int`, *optional*): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. + causal = self.is_causal and query_length != 1 + + # Decide whether to use SWA or not by layer index. + if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: + use_sliding_windows = False + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Qwen2 +class Qwen2SdpaAttention(Qwen2Attention): + """ + Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from Qwen2Attention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +QWEN2_ATTENTION_CLASSES = { + "eager": Qwen2Attention, + "flash_attention_2": Qwen2FlashAttention2, + "sdpa": Qwen2SdpaAttention, +} + + +class Qwen2DecoderLayer(nn.Module): + def __init__(self, config: Qwen2Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + if config.use_sliding_window and config._attn_implementation != "flash_attention_2": + logger.warning_once( + f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; " + "unexpected results may be encountered." 
+ ) + # self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) + self.self_attn = Qwen2Attention(config, layer_idx) + + self.mlp = Qwen2MLP(config) + self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + rotary_pos_emb: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. " + "Please make sure use `attention_mask` instead.`" + ) + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + rotary_pos_emb=rotary_pos_emb, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +QWEN2_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`Qwen2Config`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+""" + + +@add_start_docstrings( + "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", + QWEN2_START_DOCSTRING, +) +class Qwen2PreTrainedModel(PreTrainedModel): + config_class = Qwen2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Qwen2DecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +QWEN2_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. 
+ + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", + QWEN2_START_DOCSTRING, +) +class Qwen2Model(Qwen2PreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`] + + Args: + config: Qwen2Config + """ + + def __init__(self, config: Qwen2Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self._attn_implementation = config._attn_implementation + self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, 
seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + past_key_values_length = 0 + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + if self._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._attn_implementation == "sdpa" and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. 
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class Qwen2ForCausalLM(Qwen2PreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = Qwen2Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling 
loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, Qwen2ForCausalLM + + >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. 
We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +@add_start_docstrings( + """ + The Qwen2 Model transformer with a sequence classification head on top (linear layer). + + [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
+ """, + QWEN2_START_DOCSTRING, +) +class Qwen2ForSequenceClassification(Qwen2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Qwen2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif 
self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/config.json b/transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/config.json new file mode 100755 index 000000000..bdc572b07 --- /dev/null +++ b/transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/config.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "auto_map": { + "AutoConfig": "configuration_qwen2.Qwen2Config", + "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM" + }, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 32768, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.40.1", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/configuration_qwen2.py b/transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/configuration_qwen2.py new file mode 100644 index 000000000..b6ca1ed43 --- /dev/null +++ b/transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/configuration_qwen2.py @@ -0,0 +1,144 @@ +# coding=utf-8 +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Qwen2 model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "Qwen/Qwen2-7B-beta": "https://huggingface.co/Qwen/Qwen2-7B-beta/resolve/main/config.json", +} + + +class Qwen2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a + Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of + Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. 
+ + + Args: + vocab_size (`int`, *optional*, defaults to 151936): + Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Qwen2Model`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 22016): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 32): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 32768): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + use_sliding_window (`bool`, *optional*, defaults to `False`): + Whether to use sliding window attention. + sliding_window (`int`, *optional*, defaults to 4096): + Sliding window attention (SWA) window size. If not specified, will default to `4096`. + max_window_layers (`int`, *optional*, defaults to 28): + The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. 
+ + ```python + >>> from transformers import Qwen2Model, Qwen2Config + + >>> # Initializing a Qwen2 style configuration + >>> configuration = Qwen2Config() + + >>> # Initializing a model from the Qwen2-7B style configuration + >>> model = Qwen2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "qwen2" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=151936, + hidden_size=4096, + intermediate_size=22016, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=32, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + tie_word_embeddings=False, + rope_theta=10000.0, + use_sliding_window=False, + sliding_window=4096, + max_window_layers=28, + attention_dropout=0.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.use_sliding_window = use_sliding_window + self.sliding_window = sliding_window + self.max_window_layers = max_window_layers + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + + super().__init__( + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/modeling_qwen2.py b/transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/modeling_qwen2.py new file mode 100644 index 000000000..595a3e91c --- /dev/null +++ b/transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/modeling_qwen2.py @@ -0,0 +1,1436 @@ +# coding=utf-8 +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
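For reference, the exported Qwen2-1_5B-Instruct `config.json` above implies `head_dim = 1536 / 12 = 128` and a grouped-query ratio of `12 / 2 = 6`, which is the geometry the modeling code below derives in `Qwen2Attention.__init__`. A hypothetical sanity check, not part of the patch (the local directory name and the `trust_remote_code` loading path are assumptions):

```python
from transformers import AutoConfig

# Assumes a local directory containing the config.json / configuration_qwen2.py above.
config = AutoConfig.from_pretrained("Qwen2-1_5B-Instruct", trust_remote_code=True)
head_dim = config.hidden_size // config.num_attention_heads           # 1536 // 12 = 128
kv_groups = config.num_attention_heads // config.num_key_value_heads  # 12 // 2 = 6
print(head_dim, kv_groups, config.rope_theta)                         # 128 6 1000000.0
```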
+""" PyTorch Qwen2 model.""" +import inspect +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_qwen2 import Qwen2Config + + +# if is_flash_attn_2_available(): + #from flash_attn import flash_attn_func, flash_attn_varlen_func + #from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + #_flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + + +logger = logging.get_logger(__name__) + + +_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta" +_CONFIG_FOR_DOC = "Qwen2Config" + +QWEN2_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "Qwen/Qwen2-7B-beta", + # See all Qwen2 models at https://huggingface.co/models?filter=qwen2 +] + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2 +class Qwen2RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Qwen2RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2 +class Qwen2RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. 
+ self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
+ """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2 +class Qwen2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class Qwen2Attention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " + "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + self.rotary_emb = Qwen2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + rotary_pos_emb: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[torch.Tensor] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + ''' + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + kv_seq_len += past_key_value[0].shape[2] + if rotary_pos_emb is None: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + else: + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=2) + value_states = torch.cat((past_value, value_states), dim=2) + # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + past_key_value = torch.stack((key_states, value_states)) + # repeat k/v heads if n_kv_heads < n_heads + # key_states = repeat_kv(key_states, self.num_key_value_groups) + # value_states = repeat_kv(value_states, self.num_key_value_groups) + ''' + #--------------- + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + kv_seq_len = key_states.shape[1] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[1] + # rope + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + # kv cache + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=1) + value_states = torch.cat((past_value, value_states), dim=1) + past_key_value = torch.stack((key_states, value_states)) + query_states = query_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) + value_states = value_states.transpose(1, 2) + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + #--------------- + attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + 
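The rewritten block between the `#---------------` markers above is what replaces the stock `transformers` path for export: cos/sin arrive precomputed as `rotary_pos_emb` and are applied directly with `rotate_half`, tensors stay in `[batch, seq, heads, head_dim]` layout until just before the score matmul, and the KV cache is a plain stacked tensor rather than a `Cache` object. A minimal, self-contained sketch of those conventions (plain PyTorch, toy sizes, not part of the patch):

```python
# Illustrative sketch only: the rotary application and stacked-KV convention
# used by the rewritten eager path, on toy tensors.
import torch

def rotate_half(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

bsz, seq, heads, head_dim = 1, 4, 2, 8
q = torch.randn(bsz, seq, heads, head_dim)
k = torch.randn(bsz, seq, heads, head_dim)
v = torch.randn(bsz, seq, heads, head_dim)

# cos/sin shaped to broadcast over the head axis of [bsz, seq, heads, head_dim].
inv_freq = 1.0 / (10000.0 ** (torch.arange(0, head_dim, 2).float() / head_dim))
freqs = torch.outer(torch.arange(seq).float(), inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)                 # [seq, head_dim]
cos, sin = emb.cos()[None, :, None, :], emb.sin()[None, :, None, :]

q = (q * cos) + (rotate_half(q) * sin)
k = (k * cos) + (rotate_half(k) * sin)

# KV cache as one stacked tensor of shape [2, bsz, kv_seq, heads, head_dim];
# new keys/values are appended along the sequence axis (dim=1 of each half).
past_key_value = torch.stack((k, v))
k_next_step = torch.cat((past_key_value[0], k), dim=1)
print(past_key_value.shape, k_next_step.shape)          # [2,1,4,2,8] / [1,8,2,8]
```

Keeping the rotary inputs external and the cache as a single tensor keeps `Cache` bookkeeping out of the traced graph, which presumably is what allows the converter's attention-fuse pass to match this subgraph.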
+ +class Qwen2FlashAttention2(Qwen2Attention): + """ + Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention` + as the weights of the module stays untouched. The only required change would be on the forward pass + where it needs to correctly call the public API of flash attention and deal with padding tokens + in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom + config.max_window_layers layers. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # Because the input can be padded, the absolute sequence length depends on the max position id. + rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and self.config.use_sliding_window + ) + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." 
+ ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. 
+ + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`int`, *optional*): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. + causal = self.is_causal and query_length != 1 + + # Decide whether to use SWA or not by layer index. + if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: + use_sliding_windows = False + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Qwen2 +class Qwen2SdpaAttention(Qwen2Attention): + """ + Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from Qwen2Attention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +QWEN2_ATTENTION_CLASSES = { + "eager": Qwen2Attention, + "flash_attention_2": Qwen2FlashAttention2, + "sdpa": Qwen2SdpaAttention, +} + + +class Qwen2DecoderLayer(nn.Module): + def __init__(self, config: Qwen2Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + if config.use_sliding_window and config._attn_implementation != "flash_attention_2": + logger.warning_once( + f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; " + "unexpected results may be encountered." 
+ ) + # self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) + self.self_attn = Qwen2Attention(config, layer_idx) + + self.mlp = Qwen2MLP(config) + self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + rotary_pos_emb: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. " + "Please make sure use `attention_mask` instead.`" + ) + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + rotary_pos_emb=rotary_pos_emb, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +QWEN2_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`Qwen2Config`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+""" + + +@add_start_docstrings( + "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", + QWEN2_START_DOCSTRING, +) +class Qwen2PreTrainedModel(PreTrainedModel): + config_class = Qwen2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Qwen2DecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +QWEN2_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. 
+ + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", + QWEN2_START_DOCSTRING, +) +class Qwen2Model(Qwen2PreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`] + + Args: + config: Qwen2Config + """ + + def __init__(self, config: Qwen2Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self._attn_implementation = config._attn_implementation + self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, 
seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + past_key_values_length = 0 + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + if self._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._attn_implementation == "sdpa" and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. 
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class Qwen2ForCausalLM(Qwen2PreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = Qwen2Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling 
loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, Qwen2ForCausalLM + + >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. 
We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +@add_start_docstrings( + """ + The Qwen2 Model transformer with a sequence classification head on top (linear layer). + + [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
+ """, + QWEN2_START_DOCSTRING, +) +class Qwen2ForSequenceClassification(Qwen2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Qwen2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif 
self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/transformers/llm/export/llm_models/Qwen2-1_5B/config.json b/transformers/llm/export/llm_models/Qwen2-1_5B/config.json new file mode 100755 index 000000000..08a0ac476 --- /dev/null +++ b/transformers/llm/export/llm_models/Qwen2-1_5B/config.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "auto_map": { + "AutoConfig": "configuration_qwen2.Qwen2Config", + "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM" + }, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151643, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 131072, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_theta": 1000000.0, + "sliding_window": 131072, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/transformers/llm/export/llm_models/Qwen2-1_5B/configuration_qwen2.py b/transformers/llm/export/llm_models/Qwen2-1_5B/configuration_qwen2.py new file mode 100644 index 000000000..b6ca1ed43 --- /dev/null +++ b/transformers/llm/export/llm_models/Qwen2-1_5B/configuration_qwen2.py @@ -0,0 +1,144 @@ +# coding=utf-8 +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Qwen2 model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "Qwen/Qwen2-7B-beta": "https://huggingface.co/Qwen/Qwen2-7B-beta/resolve/main/config.json", +} + + +class Qwen2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a + Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of + Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. 
+ + + Args: + vocab_size (`int`, *optional*, defaults to 151936): + Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Qwen2Model`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 22016): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 32): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 32768): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + use_sliding_window (`bool`, *optional*, defaults to `False`): + Whether to use sliding window attention. + sliding_window (`int`, *optional*, defaults to 4096): + Sliding window attention (SWA) window size. If not specified, will default to `4096`. + max_window_layers (`int`, *optional*, defaults to 28): + The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. 
+ + ```python + >>> from transformers import Qwen2Model, Qwen2Config + + >>> # Initializing a Qwen2 style configuration + >>> configuration = Qwen2Config() + + >>> # Initializing a model from the Qwen2-7B style configuration + >>> model = Qwen2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "qwen2" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=151936, + hidden_size=4096, + intermediate_size=22016, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=32, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + tie_word_embeddings=False, + rope_theta=10000.0, + use_sliding_window=False, + sliding_window=4096, + max_window_layers=28, + attention_dropout=0.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.use_sliding_window = use_sliding_window + self.sliding_window = sliding_window + self.max_window_layers = max_window_layers + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + + super().__init__( + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/transformers/llm/export/llm_models/Qwen2-1_5B/modeling_qwen2.py b/transformers/llm/export/llm_models/Qwen2-1_5B/modeling_qwen2.py new file mode 100644 index 000000000..f8d5b5345 --- /dev/null +++ b/transformers/llm/export/llm_models/Qwen2-1_5B/modeling_qwen2.py @@ -0,0 +1,1434 @@ +# coding=utf-8 +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PyTorch Qwen2 model.""" +import inspect +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_qwen2 import Qwen2Config + + +# if is_flash_attn_2_available(): + #from flash_attn import flash_attn_func, flash_attn_varlen_func + #from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + #_flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + + +logger = logging.get_logger(__name__) + + +_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta" +_CONFIG_FOR_DOC = "Qwen2Config" + +QWEN2_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "Qwen/Qwen2-7B-beta", + # See all Qwen2 models at https://huggingface.co/models?filter=qwen2 +] + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2 +class Qwen2RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Qwen2RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2 +class Qwen2RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. 
+ self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
+    """
+    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
+    sin = sin[position_ids].unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2
+class Qwen2MLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+
+# Copied from transformers.models.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+class Qwen2Attention(nn.Module):
+    """
+    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
+    and "Generating Long Sequences with Sparse Transformers".
+    """
+
+    def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.is_causal = True
+        self.attention_dropout = config.attention_dropout
+
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+
+        self.rotary_emb = Qwen2RotaryEmbedding(
+            self.head_dim,
+            max_position_embeddings=self.max_position_embeddings,
+            base=self.rope_theta,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        rotary_pos_emb: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure to use `attention_mask` instead."
+            )
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        '''
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+ ) + # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + kv_seq_len += past_key_value[0].shape[2] + if rotary_pos_emb is None: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + else: + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=2) + value_states = torch.cat((past_value, value_states), dim=2) + # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + past_key_value = torch.stack((key_states, value_states)) + ''' + #--------------- + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + kv_seq_len = key_states.shape[1] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[1] + # rope + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + # kv cache + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=1) + value_states = torch.cat((past_value, value_states), dim=1) + past_key_value = torch.stack((key_states, value_states)) + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + #--------------- + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class Qwen2FlashAttention2(Qwen2Attention): + """ + Qwen2 flash attention module, following Qwen2 attention module. 
This module inherits from `Qwen2Attention`
+    as the weights of the module stay untouched. The only required change would be on the forward pass
+    where it needs to correctly call the public API of flash attention and deal with padding tokens
+    in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom
+    config.max_window_layers layers.
+    """
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        **kwargs,
+    ):
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure to use `attention_mask` instead."
+            )
+
+            # overwrite attention_mask with padding_mask
+            attention_mask = kwargs.pop("padding_mask")
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+
+        # Because the input can be padded, the absolute sequence length depends on the max position id.
+        rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
+        cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
+
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+        use_sliding_windows = (
+            _flash_supports_window_size
+            and getattr(self.config, "sliding_window", None) is not None
+            and kv_seq_len > self.config.sliding_window
+            and self.config.use_sliding_window
+        )
+
+        if not _flash_supports_window_size:
+            logger.warning_once(
+                "The current flash attention version does not support sliding window attention, for a more memory efficient implementation"
+                " make sure to upgrade flash-attn library."
+ ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. 
+ + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`int`, *optional*): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. + causal = self.is_causal and query_length != 1 + + # Decide whether to use SWA or not by layer index. + if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: + use_sliding_windows = False + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Qwen2 +class Qwen2SdpaAttention(Qwen2Attention): + """ + Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from Qwen2Attention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +QWEN2_ATTENTION_CLASSES = { + "eager": Qwen2Attention, + "flash_attention_2": Qwen2FlashAttention2, + "sdpa": Qwen2SdpaAttention, +} + + +class Qwen2DecoderLayer(nn.Module): + def __init__(self, config: Qwen2Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + if config.use_sliding_window and config._attn_implementation != "flash_attention_2": + logger.warning_once( + f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; " + "unexpected results may be encountered." 
+ ) + # self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) + self.self_attn = Qwen2Attention(config, layer_idx) + + self.mlp = Qwen2MLP(config) + self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + rotary_pos_emb: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. " + "Please make sure use `attention_mask` instead.`" + ) + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + rotary_pos_emb=rotary_pos_emb, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +QWEN2_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`Qwen2Config`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+""" + + +@add_start_docstrings( + "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", + QWEN2_START_DOCSTRING, +) +class Qwen2PreTrainedModel(PreTrainedModel): + config_class = Qwen2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Qwen2DecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +QWEN2_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. 
+ + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", + QWEN2_START_DOCSTRING, +) +class Qwen2Model(Qwen2PreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`] + + Args: + config: Qwen2Config + """ + + def __init__(self, config: Qwen2Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self._attn_implementation = config._attn_implementation + self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, 
seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + past_key_values_length = 0 + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + if self._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._attn_implementation == "sdpa" and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. 
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class Qwen2ForCausalLM(Qwen2PreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = Qwen2Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling 
loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, Qwen2ForCausalLM + + >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. 
We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +@add_start_docstrings( + """ + The Qwen2 Model transformer with a sequence classification head on top (linear layer). + + [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
+ """, + QWEN2_START_DOCSTRING, +) +class Qwen2ForSequenceClassification(Qwen2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Qwen2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif 
self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/transformers/llm/export/llm_models/Qwen2-7B-Instruct/config.json b/transformers/llm/export/llm_models/Qwen2-7B-Instruct/config.json new file mode 100755 index 000000000..eac7cd285 --- /dev/null +++ b/transformers/llm/export/llm_models/Qwen2-7B-Instruct/config.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "auto_map": { + "AutoConfig": "configuration_qwen2.Qwen2Config", + "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM" + }, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 3584, + "initializer_range": 0.02, + "intermediate_size": 18944, + "max_position_embeddings": 32768, + "max_window_layers": 28, + "model_type": "qwen2", + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.41.2", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 152064 +} diff --git a/transformers/llm/export/llm_models/Qwen2-7B-Instruct/configuration_qwen2.py b/transformers/llm/export/llm_models/Qwen2-7B-Instruct/configuration_qwen2.py new file mode 100644 index 000000000..b6ca1ed43 --- /dev/null +++ b/transformers/llm/export/llm_models/Qwen2-7B-Instruct/configuration_qwen2.py @@ -0,0 +1,144 @@ +# coding=utf-8 +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Qwen2 model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "Qwen/Qwen2-7B-beta": "https://huggingface.co/Qwen/Qwen2-7B-beta/resolve/main/config.json", +} + + +class Qwen2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a + Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of + Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. 
+ + + Args: + vocab_size (`int`, *optional*, defaults to 151936): + Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Qwen2Model`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 22016): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 32): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 32768): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + use_sliding_window (`bool`, *optional*, defaults to `False`): + Whether to use sliding window attention. + sliding_window (`int`, *optional*, defaults to 4096): + Sliding window attention (SWA) window size. If not specified, will default to `4096`. + max_window_layers (`int`, *optional*, defaults to 28): + The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. 
+ + ```python + >>> from transformers import Qwen2Model, Qwen2Config + + >>> # Initializing a Qwen2 style configuration + >>> configuration = Qwen2Config() + + >>> # Initializing a model from the Qwen2-7B style configuration + >>> model = Qwen2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "qwen2" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=151936, + hidden_size=4096, + intermediate_size=22016, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=32, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + tie_word_embeddings=False, + rope_theta=10000.0, + use_sliding_window=False, + sliding_window=4096, + max_window_layers=28, + attention_dropout=0.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.use_sliding_window = use_sliding_window + self.sliding_window = sliding_window + self.max_window_layers = max_window_layers + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + + super().__init__( + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/transformers/llm/export/llm_models/Qwen2-7B-Instruct/modeling_qwen2.py b/transformers/llm/export/llm_models/Qwen2-7B-Instruct/modeling_qwen2.py new file mode 100644 index 000000000..595a3e91c --- /dev/null +++ b/transformers/llm/export/llm_models/Qwen2-7B-Instruct/modeling_qwen2.py @@ -0,0 +1,1436 @@ +# coding=utf-8 +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
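As a quick cross-check of the configuration shipped above (illustrative only, not part of the exported files, and assuming a transformers release that already exports `Qwen2Config` with the same fields as the local `configuration_qwen2.Qwen2Config`): loading the Qwen2-7B-Instruct values from config.json gives the derived quantities the attention code below relies on, namely a head dimension of 128 and 7 query heads per key/value head.

from transformers import Qwen2Config  # same field names as configuration_qwen2.Qwen2Config

cfg = Qwen2Config(
    hidden_size=3584,
    intermediate_size=18944,
    num_hidden_layers=28,
    num_attention_heads=28,
    num_key_value_heads=4,
    max_position_embeddings=32768,
    rope_theta=1000000.0,
    vocab_size=152064,
)
head_dim = cfg.hidden_size // cfg.num_attention_heads            # 128
kv_groups = cfg.num_attention_heads // cfg.num_key_value_heads   # 7 query heads share one KV head (GQA)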
+""" PyTorch Qwen2 model.""" +import inspect +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_qwen2 import Qwen2Config + + +# if is_flash_attn_2_available(): + #from flash_attn import flash_attn_func, flash_attn_varlen_func + #from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + #_flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + + +logger = logging.get_logger(__name__) + + +_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta" +_CONFIG_FOR_DOC = "Qwen2Config" + +QWEN2_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "Qwen/Qwen2-7B-beta", + # See all Qwen2 models at https://huggingface.co/models?filter=qwen2 +] + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2 +class Qwen2RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Qwen2RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2 +class Qwen2RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. 
+ self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
+ """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2 +class Qwen2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class Qwen2Attention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " + "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + self.rotary_emb = Qwen2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + rotary_pos_emb: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[torch.Tensor] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + ''' + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + kv_seq_len += past_key_value[0].shape[2] + if rotary_pos_emb is None: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + else: + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=2) + value_states = torch.cat((past_value, value_states), dim=2) + # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + past_key_value = torch.stack((key_states, value_states)) + # repeat k/v heads if n_kv_heads < n_heads + # key_states = repeat_kv(key_states, self.num_key_value_groups) + # value_states = repeat_kv(value_states, self.num_key_value_groups) + ''' + #--------------- + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + kv_seq_len = key_states.shape[1] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[1] + # rope + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + # kv cache + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=1) + value_states = torch.cat((past_value, value_states), dim=1) + past_key_value = torch.stack((key_states, value_states)) + query_states = query_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) + value_states = value_states.transpose(1, 2) + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + #--------------- + attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + 
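The rewritten eager path above (the block between the `#---------------` markers) keeps Q/K/V in `[bsz, seq, heads, head_dim]` layout, applies the externally supplied `(cos, sin)` rotary pair, concatenates the KV cache along the sequence axis, and only then permutes K to `[bsz, heads, head_dim, kv_len]` so a single matmul yields the score matrix. A minimal shape sketch under toy dimensions (illustrative only; it reuses the 7B head geometry and omits the `repeat_kv` expansion from 4 KV heads to 28 query heads for brevity):

import torch

bsz, q_len, kv_len, heads, head_dim = 1, 4, 9, 28, 128
q = torch.randn(bsz, q_len, heads, head_dim).transpose(1, 2)       # [1, 28, 4, 128]
k = torch.randn(bsz, kv_len, heads, head_dim).permute(0, 2, 3, 1)  # [1, 28, 128, 9], as key_states.permute([0, 2, 3, 1])
v = torch.randn(bsz, kv_len, heads, head_dim).transpose(1, 2)      # [1, 28, 9, 128]

scores = torch.matmul(q, k) / head_dim ** 0.5                      # [1, 28, 4, 9]
probs = torch.softmax(scores, dim=-1)
out = torch.matmul(probs, v).transpose(1, 2).reshape(bsz, q_len, heads * head_dim)
assert out.shape == (bsz, q_len, 3584)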
+ +class Qwen2FlashAttention2(Qwen2Attention): + """ + Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention` + as the weights of the module stays untouched. The only required change would be on the forward pass + where it needs to correctly call the public API of flash attention and deal with padding tokens + in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom + config.max_window_layers layers. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # Because the input can be padded, the absolute sequence length depends on the max position id. + rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and self.config.use_sliding_window + ) + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." 
+ ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. 
+ + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`int`, *optional*): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. + causal = self.is_causal and query_length != 1 + + # Decide whether to use SWA or not by layer index. + if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: + use_sliding_windows = False + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Qwen2 +class Qwen2SdpaAttention(Qwen2Attention): + """ + Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from Qwen2Attention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +QWEN2_ATTENTION_CLASSES = { + "eager": Qwen2Attention, + "flash_attention_2": Qwen2FlashAttention2, + "sdpa": Qwen2SdpaAttention, +} + + +class Qwen2DecoderLayer(nn.Module): + def __init__(self, config: Qwen2Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + if config.use_sliding_window and config._attn_implementation != "flash_attention_2": + logger.warning_once( + f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; " + "unexpected results may be encountered." 
+ ) + # self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) + self.self_attn = Qwen2Attention(config, layer_idx) + + self.mlp = Qwen2MLP(config) + self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + rotary_pos_emb: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. " + "Please make sure use `attention_mask` instead.`" + ) + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + rotary_pos_emb=rotary_pos_emb, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +QWEN2_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`Qwen2Config`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+""" + + +@add_start_docstrings( + "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", + QWEN2_START_DOCSTRING, +) +class Qwen2PreTrainedModel(PreTrainedModel): + config_class = Qwen2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Qwen2DecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +QWEN2_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. 
+ + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", + QWEN2_START_DOCSTRING, +) +class Qwen2Model(Qwen2PreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`] + + Args: + config: Qwen2Config + """ + + def __init__(self, config: Qwen2Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self._attn_implementation = config._attn_implementation + self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, 
seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + past_key_values_length = 0 + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + if self._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._attn_implementation == "sdpa" and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. 
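+ # Note on the two mask helpers used below: given the optional 2D padding mask of
+ # shape (batch_size, kv_seq_len), both build an additive 4D mask of shape
+ # (batch_size, 1, q_len, kv_seq_len) that already encodes the causal structure:
+ # 0.0 where attention is allowed and the dtype minimum where it is not. The eager
+ # attention path simply adds this mask to the raw scores; the SDPA helper may
+ # instead return None when no explicit mask is required, so that
+ # scaled_dot_product_attention can take its fused causal path.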
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class Qwen2ForCausalLM(Qwen2PreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = Qwen2Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling 
loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, Qwen2ForCausalLM + + >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. 
We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +@add_start_docstrings( + """ + The Qwen2 Model transformer with a sequence classification head on top (linear layer). + + [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
+ """, + QWEN2_START_DOCSTRING, +) +class Qwen2ForSequenceClassification(Qwen2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Qwen2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif 
self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/transformers/llm/export/llm_models/TinyLlama-1.1B-Chat/config.json b/transformers/llm/export/llm_models/TinyLlama-1_1B-Chat/config.json similarity index 100% rename from transformers/llm/export/llm_models/TinyLlama-1.1B-Chat/config.json rename to transformers/llm/export/llm_models/TinyLlama-1_1B-Chat/config.json diff --git a/transformers/llm/export/llm_models/TinyLlama-1.1B-Chat/configuration_llama.py b/transformers/llm/export/llm_models/TinyLlama-1_1B-Chat/configuration_llama.py similarity index 100% rename from transformers/llm/export/llm_models/TinyLlama-1.1B-Chat/configuration_llama.py rename to transformers/llm/export/llm_models/TinyLlama-1_1B-Chat/configuration_llama.py diff --git a/transformers/llm/export/llm_models/TinyLlama-1.1B-Chat/modeling_llama.py b/transformers/llm/export/llm_models/TinyLlama-1_1B-Chat/modeling_llama.py similarity index 95% rename from transformers/llm/export/llm_models/TinyLlama-1.1B-Chat/modeling_llama.py rename to transformers/llm/export/llm_models/TinyLlama-1_1B-Chat/modeling_llama.py index 8c562c604..493b040b7 100644 --- a/transformers/llm/export/llm_models/TinyLlama-1.1B-Chat/modeling_llama.py +++ b/transformers/llm/export/llm_models/TinyLlama-1_1B-Chat/modeling_llama.py @@ -182,8 +182,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids): # sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] cos = torch.squeeze(cos) # [seq_len, dim] sin = torch.squeeze(sin) # [seq_len, dim] - cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + # cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + # sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed @@ -282,6 +282,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, output_attentions: bool = False, use_cache: bool = False, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: @@ -306,7 +307,7 @@ def forward( query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - + ''' query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) @@ -314,7 +315,10 @@ def forward( kv_seq_len = key_states.shape[-2] if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + if rotary_pos_emb is None: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + else: + cos, sin = rotary_pos_emb query_states, key_states = apply_rotary_pos_emb(query_states, 
key_states, cos, sin, position_ids) if past_key_value is not None: @@ -327,8 +331,32 @@ def forward( # repeat k/v heads if n_kv_heads < n_heads key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + ''' + #--------------- + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + kv_seq_len = key_states.shape[1] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[1] + # rope + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + # kv cache + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=1) + value_states = torch.cat((past_value, value_states), dim=1) + past_key_value = torch.stack((key_states, value_states)) + query_states = query_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) + value_states = value_states.transpose(1, 2) + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + #--------------- + attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( @@ -384,6 +412,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: @@ -411,6 +440,7 @@ def forward( attention_mask=attention_mask, position_ids=position_ids, past_key_value=past_key_value, + rotary_pos_emb=rotary_pos_emb, output_attentions=output_attentions, use_cache=use_cache, ) diff --git a/transformers/llm/export/llm_models/Yi-6B-Chat/modeling_llama.py b/transformers/llm/export/llm_models/Yi-6B-Chat/modeling_llama.py index 8c562c604..493b040b7 100644 --- a/transformers/llm/export/llm_models/Yi-6B-Chat/modeling_llama.py +++ b/transformers/llm/export/llm_models/Yi-6B-Chat/modeling_llama.py @@ -182,8 +182,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids): # sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] cos = torch.squeeze(cos) # [seq_len, dim] sin = torch.squeeze(sin) # [seq_len, dim] - cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + # cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + # sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed @@ -282,6 +282,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, output_attentions: bool = False, use_cache: bool = False, ) -> Tuple[torch.Tensor, 
Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: @@ -306,7 +307,7 @@ def forward( query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - + ''' query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) @@ -314,7 +315,10 @@ def forward( kv_seq_len = key_states.shape[-2] if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + if rotary_pos_emb is None: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + else: + cos, sin = rotary_pos_emb query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: @@ -327,8 +331,32 @@ def forward( # repeat k/v heads if n_kv_heads < n_heads key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + ''' + #--------------- + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + kv_seq_len = key_states.shape[1] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[1] + # rope + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + # kv cache + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=1) + value_states = torch.cat((past_value, value_states), dim=1) + past_key_value = torch.stack((key_states, value_states)) + query_states = query_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) + value_states = value_states.transpose(1, 2) + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + #--------------- + attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( @@ -384,6 +412,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: @@ -411,6 +440,7 @@ def forward( attention_mask=attention_mask, position_ids=position_ids, past_key_value=past_key_value, + rotary_pos_emb=rotary_pos_emb, output_attentions=output_attentions, use_cache=use_cache, ) diff --git a/transformers/llm/export/llm_models/deepseek-llm-7b-chat/modeling_llama.py b/transformers/llm/export/llm_models/deepseek-llm-7b-chat/modeling_llama.py index 8c562c604..493b040b7 100644 --- a/transformers/llm/export/llm_models/deepseek-llm-7b-chat/modeling_llama.py +++ 
b/transformers/llm/export/llm_models/deepseek-llm-7b-chat/modeling_llama.py @@ -182,8 +182,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids): # sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] cos = torch.squeeze(cos) # [seq_len, dim] sin = torch.squeeze(sin) # [seq_len, dim] - cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + # cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + # sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed @@ -282,6 +282,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, output_attentions: bool = False, use_cache: bool = False, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: @@ -306,7 +307,7 @@ def forward( query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - + ''' query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) @@ -314,7 +315,10 @@ def forward( kv_seq_len = key_states.shape[-2] if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + if rotary_pos_emb is None: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + else: + cos, sin = rotary_pos_emb query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: @@ -327,8 +331,32 @@ def forward( # repeat k/v heads if n_kv_heads < n_heads key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + ''' + #--------------- + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + kv_seq_len = key_states.shape[1] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[1] + # rope + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + # kv cache + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=1) + value_states = torch.cat((past_value, value_states), dim=1) + past_key_value = torch.stack((key_states, value_states)) + query_states = query_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) + value_states = value_states.transpose(1, 2) + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + #--------------- + attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) if attn_weights.size() != (bsz, 
self.num_heads, q_len, kv_seq_len): raise ValueError( @@ -384,6 +412,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: @@ -411,6 +440,7 @@ def forward( attention_mask=attention_mask, position_ids=position_ids, past_key_value=past_key_value, + rotary_pos_emb=rotary_pos_emb, output_attentions=output_attentions, use_cache=use_cache, ) diff --git a/transformers/llm/export/llm_models/glm-4-9b-chat/modeling_chatglm.py b/transformers/llm/export/llm_models/glm-4-9b-chat/modeling_chatglm.py new file mode 100755 index 000000000..e86f5a2f4 --- /dev/null +++ b/transformers/llm/export/llm_models/glm-4-9b-chat/modeling_chatglm.py @@ -0,0 +1,1238 @@ +""" PyTorch ChatGLM model. """ +import json +import math +import copy +import warnings +import re +import sys + +import torch +import torch.utils.checkpoint +import torch.nn.functional as F +from torch import nn +from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss +from torch.nn.utils import skip_init +from typing import Optional, Tuple, Union, List, Callable, Dict, Any +from copy import deepcopy + +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import logging +from transformers.generation.logits_process import LogitsProcessor +from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput + +from .configuration_chatglm import ChatGLMConfig + +# flags required to enable jit fusion kernels + +if sys.platform != 'darwin': + torch._C._jit_set_profiling_mode(False) + torch._C._jit_set_profiling_executor(False) + torch._C._jit_override_can_fuse_on_cpu(True) + torch._C._jit_override_can_fuse_on_gpu(True) + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM" +_CONFIG_FOR_DOC = "ChatGLMConfig" + +def default_init(cls, *args, **kwargs): + return cls(*args, **kwargs) + + +class InvalidScoreLogitsProcessor(LogitsProcessor): + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + if torch.isnan(scores).any() or torch.isinf(scores).any(): + scores.zero_() + scores[..., 198] = 5e4 + return scores + + +def split_tensor_along_last_dim( + tensor: torch.Tensor, + num_partitions: int, + contiguous_split_chunks: bool = False, +) -> List[torch.Tensor]: + """Split a tensor along its last dimension. + + Arguments: + tensor: input tensor. + num_partitions: number of partitions to split the tensor + contiguous_split_chunks: If True, make each chunk contiguous + in memory. + + Returns: + A list of Tensors + """ + # Get the size and dimension. + last_dim = tensor.dim() - 1 + last_dim_size = tensor.size()[last_dim] // num_partitions + # Split. + tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) + # Note: torch.split does not create contiguous tensors by default. 
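+        # Contiguous copies are made only when the caller asks for them; chunks that will be
+        # reshaped with .view() need contiguous storage, plain reads do not.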
+ if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list + + +class RotaryEmbedding(nn.Module): + def __init__(self, dim, rope_ratio=1, original_impl=False, device=None, dtype=None): + super().__init__() + inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim)) + self.register_buffer("inv_freq", inv_freq) + self.dim = dim + self.original_impl = original_impl + self.rope_ratio = rope_ratio + + def forward_impl( + self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000 + ): + """Enhanced Transformer with Rotary Position Embedding. + + Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/ + transformers/rope/__init__.py. MIT License: + https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license. + """ + # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$ + base = base * self.rope_ratio + theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem)) + + # Create position indexes `[0, 1, ..., seq_len - 1]` + seq_idx = torch.arange(seq_len, dtype=torch.float, device=device) + + # Calculate the product of position index and $\theta_i$ + idx_theta = torch.outer(seq_idx, theta).float() + + cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1) + + # this is to mimic the behaviour of complex32, else we will get different results + if dtype in (torch.float16, torch.bfloat16, torch.int8): + cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half() + return cache + + def forward(self, max_seq_len, offset=0): + return self.forward_impl( + max_seq_len, self.dim, dtype=self.inv_freq.dtype, device=self.inv_freq.device + ) + + +@torch.jit.script +def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor: + # x: [b, np, sq, hn] + b, np, sq, hn = x.size(0), x.size(1), x.size(2), x.size(3) + rot_dim = rope_cache.shape[-2] * 2 + x, x_pass = x[..., :rot_dim], x[..., rot_dim:] + # truncate to support variable sizes + rope_cache = rope_cache[:, :sq] + xshaped = x.reshape(b, np, sq, rot_dim // 2, 2) + rope_cache = rope_cache.view(-1, 1, sq, xshaped.size(3), 2) + x_out2 = torch.stack( + [ + xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1], + xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1], + ], + -1, + ) + x_out2 = x_out2.flatten(3) + return torch.cat((x_out2, x_pass), dim=-1) + + +class RMSNorm(torch.nn.Module): + def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs): + super().__init__() + self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype)) + self.eps = eps + + def forward(self, hidden_states: torch.Tensor): + input_dtype = hidden_states.dtype + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.eps) + + return (self.weight * hidden_states).to(input_dtype) + + +class CoreAttention(torch.nn.Module): + def __init__(self, config: ChatGLMConfig, layer_number): + super(CoreAttention, self).__init__() + + self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32 + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + self.layer_number = max(1, layer_number) + + projection_size 
= config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. + self.hidden_size_per_partition = projection_size + self.hidden_size_per_attention_head = projection_size // config.num_attention_heads + self.num_attention_heads_per_partition = config.num_attention_heads + + coeff = None + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + if self.apply_query_key_layer_scaling: + coeff = self.layer_number + self.norm_factor *= coeff + self.coeff = coeff + + self.attention_dropout = torch.nn.Dropout(config.attention_dropout) + + def raw_atten(self, query_layer, key_layer, value_layer, attention_mask): + attn_weights = torch.matmul(query_layer, key_layer.transpose(-1, -2)) / self.norm_factor + if attention_mask is None: + seq_len = query_layer.shape[2] + attention_mask = ~torch.tril(torch.ones([1, 1, seq_len, seq_len], device=attn_weights.device).bool()) + attn_weights = attn_weights.masked_fill(attention_mask, float("-inf")) + #mask_value = torch.finfo(attn_weights.dtype).min + #attn_weights = torch.where(attention_mask, attn_weights.to(attn_weights.dtype), mask_value) + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + context_layer = torch.matmul(attn_weights, value_layer) + return context_layer + context_layer = context_layer.transpose(1, 2).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) + context_layer = context_layer.reshape(*new_context_layer_shape) + return context_layer + + def forward(self, query_layer, key_layer, value_layer, attention_mask): + pytorch_major_version = int(torch.__version__.split('.')[0]) + if pytorch_major_version >= 2 and False: + if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]: + context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, + is_causal=True) + else: + if attention_mask is not None: + attention_mask = ~attention_mask + context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, + attention_mask) + context_layer = context_layer.transpose(1, 2).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) + context_layer = context_layer.reshape(*new_context_layer_shape) + else: + # Raw attention scores + + # [b, np, sq, sk] + output_size = (query_layer.size(0), query_layer.size(1), query_layer.size(2), key_layer.size(2)) + + # [b, np, sq, hn] -> [b * np, sq, hn] + query_layer = query_layer.view(output_size[0] * output_size[1], output_size[2], -1) + # [b, np, sk, hn] -> [b * np, sk, hn] + key_layer = key_layer.view(output_size[0] * output_size[1], output_size[3], -1) + + # preallocting input tensor: [b * np, sq, sk] + matmul_input_buffer = torch.empty( + output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype, + device=query_layer.device + ) + + # Raw attention scores. 
[b * np, sq, sk] + matmul_result = torch.baddbmm( + matmul_input_buffer, + query_layer, # [b * np, sq, hn] + key_layer.transpose(1, 2), # [b * np, hn, sk] + beta=0.0, + alpha=(1.0 / self.norm_factor), + ) + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(*output_size) + + # =========================== + # Attention probs and dropout + # =========================== + + # attention scores and attention mask [b, np, sq, sk] + if self.attention_softmax_in_fp32: + attention_scores = attention_scores.float() + if self.coeff is not None: + attention_scores = attention_scores * self.coeff + if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]: + attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3], + device=attention_scores.device, dtype=torch.bool) + attention_mask.tril_() + attention_mask = ~attention_mask + + if attention_mask is not None: + attention_scores = attention_scores.masked_fill(attention_mask, float("-inf")) + attention_probs = F.softmax(attention_scores, dim=-1) + attention_probs = attention_probs.type_as(value_layer) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.attention_dropout(attention_probs) + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value_layer -> context layer. + # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3)) + # change view [b * np, sk, hn] + #value_layer = value_layer.view(output_size[0] * output_size[1], value_layer.size(2), -1) + # change view [b * np, sq, sk] + #attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) + # matmul: [b * np, sq, hn] + # context_layer = torch.bmm(attention_probs, value_layer) + context_layer = torch.matmul(attention_probs, value_layer) + # change view [b, np, sq, hn] + # context_layer = context_layer.view(*output_size) + # [b, np, sq, hn] --> [b, sq, np, hn] + context_layer = context_layer.transpose(1, 2).contiguous() + # [b, sq, np, hn] --> [b, sq, hp] + new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) + context_layer = context_layer.reshape(*new_context_layer_shape) + + return context_layer + + +class SelfAttention(torch.nn.Module): + """Parallel self-attention layer abstract class. + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def __init__(self, config: ChatGLMConfig, layer_number, device=None): + super(SelfAttention, self).__init__() + self.layer_number = max(1, layer_number) + + self.projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. 
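+        # With a single model-parallel partition, these per-partition values are simply the
+        # full per-head dimension and the full attention-head count.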
+ self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads + self.num_attention_heads_per_partition = config.num_attention_heads + + self.multi_query_attention = config.multi_query_attention + self.qkv_hidden_size = 3 * self.projection_size + if self.multi_query_attention: + self.num_multi_query_groups_per_partition = config.multi_query_group_num + self.qkv_hidden_size = ( + self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num + ) + self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size, + bias=config.add_bias_linear or config.add_qkv_bias, + device=device, **_config_to_kwargs(config) + ) + + self.core_attention = CoreAttention(config, self.layer_number) + + # Output. + self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear, + device=device, **_config_to_kwargs(config) + ) + + def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None): + if self.multi_query_attention: + num_attention_heads = self.num_multi_query_groups_per_partition + else: + num_attention_heads = self.num_attention_heads_per_partition + return torch.empty( + inference_max_sequence_len, + batch_size, + num_attention_heads, + self.hidden_size_per_attention_head, + dtype=dtype, + device=device, + ) + + def forward(self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True): + # hidden_states: [b, sq, h] + + # ================================================= + # Pre-allocate memory for key-values for inference. + # ================================================= + # ===================== + # Query, Key, and Value + # ===================== + + # Attention heads [b, sq, h] --> [b, sq, (np * 3 * hn)] + mixed_x_layer = self.query_key_value(hidden_states) + + if self.multi_query_attention: + (query_layer, key_layer, value_layer) = mixed_x_layer.split( + [ + self.num_attention_heads_per_partition * self.hidden_size_per_attention_head, + self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, + self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, + ], + dim=-1, + ) + query_layer = query_layer.view( + query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) + ) + key_layer = key_layer.view( + key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) + ) + value_layer = value_layer.view( + value_layer.size()[:-1] + + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) + ) + else: + new_tensor_shape = mixed_x_layer.size()[:-1] + \ + (self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + # [b, sq, np, 3 * hn] --> 3 [b, sq, np, hn] + (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) + + # [b, sq, np, hn] -> [b, np, sq, hn] + query_layer, key_layer, value_layer = [k.transpose(1, 2) for k in [query_layer, key_layer, value_layer]] + + # apply relative positional encoding (rotary embedding) + if rotary_pos_emb is not None: + query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb) + key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb) + + # adjust key and value for inference + if kv_cache is not None: + cache_k, cache_v = kv_cache + key_layer = torch.cat((cache_k, key_layer), dim=2) + value_layer = torch.cat((cache_v, value_layer), dim=2) + if 
use_cache: + ''' + if kv_cache is None: + kv_cache = torch.cat((key_layer.unsqueeze(0).unsqueeze(0), value_layer.unsqueeze(0).unsqueeze(0)), dim=1) + else: + kv_cache = (key_layer, value_layer) + ''' + kv_cache = torch.stack([key_layer, value_layer], axis=0) + # ''' + else: + kv_cache = None + + if self.multi_query_attention: + key_layer = key_layer.unsqueeze(2) + key_layer = key_layer.expand( + -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1, -1 + ) + key_layer = key_layer.contiguous().view( + key_layer.size()[:1] + (self.num_attention_heads_per_partition,) + key_layer.size()[3:] + ) + value_layer = value_layer.unsqueeze(2) + value_layer = value_layer.expand( + -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1, -1 + ) + value_layer = value_layer.contiguous().view( + value_layer.size()[:1] + (self.num_attention_heads_per_partition,) + value_layer.size()[3:] + ) + + # ================================== + # core attention computation + # ================================== + + context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask) + + # ================= + # Output. [sq, b, h] + # ================= + + output = self.dense(context_layer) + + return output, kv_cache + + +def _config_to_kwargs(args): + common_kwargs = { + "dtype": args.torch_dtype, + } + return common_kwargs + + +class MLP(torch.nn.Module): + """MLP. + + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform nonlinear transformation, and project the + state back into h hidden dimension. + """ + + def __init__(self, config: ChatGLMConfig, device=None): + super(MLP, self).__init__() + + self.add_bias = config.add_bias_linear + + # Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf + self.dense_h_to_4h = nn.Linear( + config.hidden_size, + config.ffn_hidden_size * 2, + bias=self.add_bias, + device=device, + **_config_to_kwargs(config) + ) + + def swiglu(x): + x = torch.chunk(x, 2, dim=-1) + return F.silu(x[0]) * x[1] + + self.activation_func = swiglu + + # Project back to h. + self.dense_4h_to_h = nn.Linear( + config.ffn_hidden_size, + config.hidden_size, + bias=self.add_bias, + device=device, + **_config_to_kwargs(config) + ) + + def forward(self, hidden_states): + # [s, b, 4hp] + intermediate_parallel = self.dense_h_to_4h(hidden_states) + intermediate_parallel = self.activation_func(intermediate_parallel) + # [s, b, h] + output = self.dense_4h_to_h(intermediate_parallel) + return output + + +class GLMBlock(torch.nn.Module): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + """ + + def __init__(self, config: ChatGLMConfig, layer_number, device=None): + super(GLMBlock, self).__init__() + self.layer_number = layer_number + + self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm + + self.fp32_residual_connection = config.fp32_residual_connection + + LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm + # Layernorm on the input data. + self.input_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + # Self attention. 
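+        # layer_number is 1-based; CoreAttention uses it as the scaling coefficient when
+        # apply_query_key_layer_scaling is enabled.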
+ self.self_attention = SelfAttention(config, layer_number, device=device) + self.hidden_dropout = config.hidden_dropout + + # Layernorm on the attention output + self.post_attention_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + # MLP + self.mlp = MLP(config, device=device) + + def forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True, + ): + # hidden_states: [s, b, h] + hidden_states = hidden_states.view(1, -1, 4096) + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + # Self attention. + attention_output, kv_cache = self.self_attention( + layernorm_output, + attention_mask, + rotary_pos_emb, + kv_cache=kv_cache, + use_cache=use_cache + ) + + # Residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training) + layernorm_input = residual + layernorm_input + + # Layer norm post the self attention. + layernorm_output = self.post_attention_layernorm(layernorm_input) + + # MLP. + mlp_output = self.mlp(layernorm_output) + + # Second residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = layernorm_input + + output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training) + output = residual + output + + return output, kv_cache + + +class GLMTransformer(torch.nn.Module): + """Transformer class.""" + + def __init__(self, config: ChatGLMConfig, device=None): + super(GLMTransformer, self).__init__() + + self.fp32_residual_connection = config.fp32_residual_connection + self.post_layer_norm = config.post_layer_norm + + # Number of layers. + self.num_layers = config.num_layers + + # Transformer layers. + def build_layer(layer_number): + return GLMBlock(config, layer_number, device=device) + + self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)]) + + if self.post_layer_norm: + LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm + # Final layer norm before output. + self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + self.gradient_checkpointing = False + + def _get_layer(self, layer_number): + return self.layers[layer_number] + + def forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None, + use_cache: Optional[bool] = True, + output_hidden_states: Optional[bool] = False, + ): + if not kv_caches: + kv_caches = [None for _ in range(self.num_layers)] + presents = () if use_cache else None + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + all_self_attentions = None + all_hidden_states = () if output_hidden_states else None + for index in range(self.num_layers): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer = self._get_layer(index) + if self.gradient_checkpointing and self.training: + layer_ret = torch.utils.checkpoint.checkpoint( + layer, + hidden_states, + attention_mask, + rotary_pos_emb, + kv_caches[index], + use_cache, + use_reentrant=False + ) + else: + layer_ret = layer( + hidden_states, + attention_mask, + rotary_pos_emb, + kv_cache=kv_caches[index], + use_cache=use_cache + ) + hidden_states, kv_cache = layer_ret + if use_cache: + # token by token decoding, use tuple format + if kv_caches[0] is not None: + presents = presents + (kv_cache,) + # prefilling in decoding, use tensor format to save cuda memory + else: + if len(presents) == 0: + presents = kv_cache + else: + presents = torch.cat((presents, kv_cache), dim=0) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # Final layer norm. + if self.post_layer_norm: + hidden_states = self.final_layernorm(hidden_states) + + return hidden_states, presents, all_hidden_states, all_self_attentions + + +class ChatGLMPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and + a simple interface for downloading and loading pretrained models. + """ + + is_parallelizable = False + supports_gradient_checkpointing = True + config_class = ChatGLMConfig + base_model_prefix = "transformer" + _no_split_modules = ["GLMBlock"] + + def _init_weights(self, module: nn.Module): + """Initialize the weights.""" + return + + def get_masks(self, input_ids, past_key_values, padding_mask=None): + batch_size, seq_length = input_ids.shape + full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device) + full_attention_mask.tril_() + past_length = 0 + if past_key_values: + past_length = past_key_values[0][0].shape[2] + if past_length: + full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length, + device=input_ids.device), full_attention_mask), dim=-1) + if padding_mask is not None: + full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1) + if not past_length and padding_mask is not None: + full_attention_mask -= padding_mask.unsqueeze(-1) - 1 + full_attention_mask = (full_attention_mask < 0.5).bool() + full_attention_mask.unsqueeze_(1) + return full_attention_mask + + def get_position_ids(self, input_ids, device): + batch_size, seq_length = input_ids.shape + position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1) + return position_ids + + def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None): + if not self.supports_gradient_checkpointing: + raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") + + +class Embedding(torch.nn.Module): + """Language model embeddings.""" + + def __init__(self, config: ChatGLMConfig, device=None): + super(Embedding, self).__init__() + + self.hidden_size = config.hidden_size + # Word embeddings (parallel). + self.word_embeddings = nn.Embedding( + config.padded_vocab_size, + self.hidden_size, + dtype=config.torch_dtype, + device=device + ) + self.fp32_residual_connection = config.fp32_residual_connection + + def forward(self, input_ids): + # Embeddings. 
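+        # Token-id lookup: [b, sq] -> [b, sq, h]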
+ words_embeddings = self.word_embeddings(input_ids) + embeddings = words_embeddings + # If the input flag for fp32 residual connection is set, convert for float. + if self.fp32_residual_connection: + embeddings = embeddings.float() + return embeddings + + +class ChatGLMModel(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, device=None, empty_init=True): + super().__init__(config) + if empty_init: + init_method = skip_init + else: + init_method = default_init + init_kwargs = {} + if device is not None: + init_kwargs["device"] = device + self.embedding = init_method(Embedding, config, **init_kwargs) + self.num_layers = config.num_layers + self.multi_query_group_num = config.multi_query_group_num + self.kv_channels = config.kv_channels + + # Rotary positional embeddings + self.seq_length = config.seq_length + rotary_dim = ( + config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels + ) + + self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, rope_ratio=config.rope_ratio, original_impl=config.original_rope, + device=device, dtype=config.torch_dtype) + self.encoder = init_method(GLMTransformer, config, **init_kwargs) + self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False, + dtype=config.torch_dtype, **init_kwargs) + + def get_input_embeddings(self): + return self.embedding.word_embeddings + + def set_input_embeddings(self, value): + self.embedding.word_embeddings = value + + def forward( + self, + input_ids, + position_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.BoolTensor] = None, + full_attention_mask: Optional[torch.BoolTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, seq_length = input_ids.shape + + if inputs_embeds is None: + inputs_embeds = self.embedding(input_ids) + + if full_attention_mask is None: + if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1): + full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask) + + # Rotary positional embeddings + rotary_pos_emb = self.rotary_pos_emb(self.seq_length) + if position_ids is not None: + rotary_pos_emb = rotary_pos_emb[position_ids] + else: + rotary_pos_emb = rotary_pos_emb[None, :seq_length] + + # Run encoder. 
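+        # During prefill the encoder returns the per-layer KV caches stacked into a single
+        # tensor; they are split back into per-layer (key, value) tuples just below.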
+ hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder( + inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb, + kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states + ) + if presents is not None and type(presents) is torch.Tensor: + presents = presents.split(1, dim=0) + presents = list(presents) + presents = [list(x.squeeze(0).split(1, dim=0)) for x in presents] + presents = [tuple([x.squeeze(0) for x in y]) for y in presents] + presents = tuple(presents) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): + super().__init__(config) + + self.max_sequence_length = config.max_length + self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) + self.config = config + + def _update_model_kwargs_for_generation( + self, + outputs: ModelOutput, + model_kwargs: Dict[str, Any], + is_encoder_decoder: bool = False, + standardize_cache_format: bool = False, + ) -> Dict[str, Any]: + # update past_key_values + model_kwargs["past_key_values"] = self._extract_past_from_model_output( + outputs, standardize_cache_format=standardize_cache_format + ) + + # update attention mask + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = torch.cat( + [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 + ) + + # update position ids + if "position_ids" in model_kwargs: + position_ids = model_kwargs["position_ids"] + new_position_id = position_ids[..., -1:].clone() + new_position_id += 1 + model_kwargs["position_ids"] = torch.cat( + [position_ids, new_position_id], dim=-1 + ) + + model_kwargs["is_first_forward"] = False + return model_kwargs + + def prepare_inputs_for_generation( + self, + input_ids: torch.LongTensor, + past_key_values: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + is_first_forward: bool = True, + **kwargs + ) -> dict: + # only last token for input_ids if past is not None + if position_ids is None: + position_ids = self.get_position_ids(input_ids, device=input_ids.device) + if not is_first_forward: + if past_key_values is not None: + position_ids = position_ids[..., -1:] + input_ids = input_ids[:, -1:] + return { + "input_ids": input_ids, + "past_key_values": past_key_values, + "position_ids": position_ids, + "attention_mask": attention_mask, + "return_last_logit": True, + "use_cache": use_cache + } + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + return_last_logit: Optional[bool] = False, + ): + use_cache = use_cache if use_cache is not None else self.config.use_cache + 
return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + if return_last_logit: + hidden_states = hidden_states[:, -1:] + lm_logits = self.transformer.output_layer(hidden_states) + + loss = None + if labels is not None: + lm_logits = lm_logits.to(torch.float32) + + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss(ignore_index=-100) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + lm_logits = lm_logits.to(hidden_states.dtype) + loss = loss.to(hidden_states.dtype) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + @staticmethod + def _reorder_cache( + past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor + ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]: + """ + This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or + [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct + beam_idx at every generation step. + + Output shares the same memory storage as `past`. 
+ """ + return tuple( + ( + layer_past[0].index_select(0, beam_idx.to(layer_past[0].device)), + layer_past[1].index_select(0, beam_idx.to(layer_past[1].device)), + ) + for layer_past in past + ) + + def process_response(self, output, history): + content = "" + history = deepcopy(history) + for response in output.split("<|assistant|>"): + if "\n" in response: + metadata, content = response.split("\n", maxsplit=1) + else: + metadata, content = "", response + if not metadata.strip(): + content = content.strip() + history.append({"role": "assistant", "metadata": metadata, "content": content}) + content = content.replace("[[训练时间]]", "2023年") + else: + history.append({"role": "assistant", "metadata": metadata, "content": content}) + if history[0]["role"] == "system" and "tools" in history[0]: + parameters = json.loads(content) + content = {"name": metadata.strip(), "parameters": parameters} + else: + content = {"name": metadata.strip(), "content": content} + return content, history + + @torch.inference_mode() + def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", + max_length: int = 8192, num_beams=1, do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None, + **kwargs): + if history is None: + history = [] + if logits_processor is None: + logits_processor = LogitsProcessorList() + logits_processor.append(InvalidScoreLogitsProcessor()) + gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, + "temperature": temperature, "logits_processor": logits_processor, **kwargs} + history.append({"role": role, "content": query}) + inputs = tokenizer.apply_chat_template(history, add_generation_prompt=True, tokenize=True, + return_tensors="pt", return_dict=True) + inputs = inputs.to(self.device) + eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|user|>"), + tokenizer.convert_tokens_to_ids("<|observation|>")] + outputs = self.generate(**inputs, **gen_kwargs, eos_token_id=eos_token_id) + outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] + response = tokenizer.decode(outputs) + response, history = self.process_response(response, history) + return response, history + + @torch.inference_mode() + def stream_chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", + past_key_values=None, max_length: int = 8192, do_sample=True, top_p=0.8, temperature=0.8, + logits_processor=None, return_past_key_values=False, **kwargs): + if history is None: + history = [] + if logits_processor is None: + logits_processor = LogitsProcessorList() + logits_processor.append(InvalidScoreLogitsProcessor()) + eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|user|>"), + tokenizer.convert_tokens_to_ids("<|observation|>")] + gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p, + "temperature": temperature, "logits_processor": logits_processor, **kwargs} + if past_key_values is None: + inputs = tokenizer.apply_chat_template(history + [{"role": role, "content": query}], + add_generation_prompt=True, tokenize=True, return_tensors="pt", + return_dict=True) + else: + inputs = tokenizer.apply_chat_template([{"role": role, "content": query}], add_special_tokens=False, + add_generation_prompt=True, tokenize=True, return_tensors="pt", + return_dict=True) + inputs = inputs.to(self.device) + if past_key_values is not None: + past_length = past_key_values[0][0].shape[2] + inputs.position_ids += past_length + attention_mask = inputs.attention_mask + 
attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1) + inputs['attention_mask'] = attention_mask + history.append({"role": role, "content": query}) + for outputs in self.stream_generate(**inputs, past_key_values=past_key_values, + eos_token_id=eos_token_id, return_past_key_values=return_past_key_values, + **gen_kwargs): + if return_past_key_values: + outputs, past_key_values = outputs + outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] + response = tokenizer.decode(outputs) + if response and response[-1] != "�": + response, new_history = self.process_response(response, history) + if return_past_key_values: + yield response, new_history, past_key_values + else: + yield response, new_history + + @torch.inference_mode() + def stream_generate( + self, + input_ids, + generation_config: Optional[GenerationConfig] = None, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, + return_past_key_values=False, + **kwargs, + ): + batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1] + + if generation_config is None: + generation_config = self.generation_config + generation_config = copy.deepcopy(generation_config) + model_kwargs = generation_config.update(**kwargs) + model_kwargs["use_cache"] = generation_config.use_cache + bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id + + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None + + has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None + if has_default_max_length and generation_config.max_new_tokens is None: + warnings.warn( + f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. " + "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we" + " recommend using `max_new_tokens` to control the maximum length of the generation.", + UserWarning, + ) + elif generation_config.max_new_tokens is not None: + generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length + if not has_default_max_length: + logger.warn( + f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(=" + f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. " + "Please refer to the documentation for more information. " + "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)", + UserWarning, + ) + + if input_ids_seq_length >= generation_config.max_length: + input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" + logger.warning( + f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to" + f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider" + " increasing `max_new_tokens`." + ) + + # 2. 
Set generation parameters if not already defined + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + + logits_processor = self._get_logits_processor( + generation_config=generation_config, + input_ids_seq_length=input_ids_seq_length, + encoder_input_ids=input_ids, + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, + logits_processor=logits_processor, + ) + + stopping_criteria = self._get_stopping_criteria( + generation_config=generation_config, stopping_criteria=stopping_criteria + ) + logits_warper = self._get_logits_warper(generation_config) + + unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) + scores = None + while True: + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + # forward pass to get next token + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=False, + output_hidden_states=False, + ) + + next_token_logits = outputs.logits[:, -1, :] + + # pre-process distribution + next_token_scores = logits_processor(input_ids, next_token_logits) + next_token_scores = logits_warper(input_ids, next_token_scores) + + # sample + probs = nn.functional.softmax(next_token_scores, dim=-1) + if generation_config.do_sample: + next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) + else: + next_tokens = torch.argmax(probs, dim=-1) + # update generated ids, model inputs, and length for next step + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + unfinished_sequences = unfinished_sequences.mul( + next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) + ) + if return_past_key_values: + yield input_ids, outputs.past_key_values + else: + yield input_ids + # stop when each sentence is finished, or if we exceed the maximum length + if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): + break + + +class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): + super().__init__(config) + + self.num_labels = config.num_labels + self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) + + self.classifier_head = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=torch.half) + if config.classifier_dropout is not None: + self.dropout = nn.Dropout(config.classifier_dropout) + else: + self.dropout = None + self.config = config + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + full_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + inputs_embeds: Optional[torch.LongTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutputWithPast]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + 
full_attention_mask=full_attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + pooled_hidden_states = hidden_states[-1] + if self.dropout is not None: + pooled_hidden_states = self.dropout(pooled_hidden_states) + logits = self.classifier_head(pooled_hidden_states) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze().float(), labels.squeeze()) + else: + loss = loss_fct(logits.float(), labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels).float(), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits.float(), labels.view(-1, self.num_labels)) + + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/transformers/llm/export/llm_models/internlm-chat-7b/modeling_internlm.py b/transformers/llm/export/llm_models/internlm-chat-7b/modeling_internlm.py index af9f5842c..b636e8716 100755 --- a/transformers/llm/export/llm_models/internlm-chat-7b/modeling_internlm.py +++ b/transformers/llm/export/llm_models/internlm-chat-7b/modeling_internlm.py @@ -147,8 +147,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids): # sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] cos = torch.squeeze(cos) # [seq_len, dim] sin = torch.squeeze(sin) # [seq_len, dim] - cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + # cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + # sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed @@ -202,11 +202,12 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, output_attentions: bool = False, use_cache: bool = False, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() - + ''' query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) @@ -214,7 +215,10 @@ def forward( kv_seq_len = key_states.shape[-2] if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] - cos, sin = 
self.rotary_emb(value_states, seq_len=kv_seq_len) + if rotary_pos_emb is None: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + else: + cos, sin = rotary_pos_emb query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) # [bsz, nh, t, hd] @@ -226,7 +230,30 @@ def forward( past_key_value = (key_states, value_states) if use_cache else None attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + ''' + #--------------- + query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim) + key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim) + value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim) + kv_seq_len = key_states.shape[1] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[1] + # rope + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + # kv cache + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=1) + value_states = torch.cat((past_value, value_states), dim=1) + past_key_value = torch.stack((key_states, value_states)) + query_states = query_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) + value_states = value_states.transpose(1, 2) + attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) + #--------------- if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" @@ -239,7 +266,7 @@ def forward( f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" ) attn_weights = attn_weights + attention_mask - attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)) + # attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)) # upcast attention to fp32 attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) @@ -281,6 +308,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: @@ -308,6 +336,7 @@ def forward( attention_mask=attention_mask, position_ids=position_ids, past_key_value=past_key_value, + rotary_pos_emb=rotary_pos_emb, output_attentions=output_attentions, use_cache=use_cache, )
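
The rewritten InternLM attention above keeps query/key/value in a [batch, seq_len, num_heads, head_dim] layout, applies the externally supplied rotary_pos_emb (cos/sin) before any transpose, concatenates the KV cache along the sequence dimension, and only permutes tensors for the final score matmul. The standalone sketch below (not part of the patch; toy sizes and a naive rotary table are assumed purely for illustration) checks that this permuted layout produces the same attention scores as the conventional transpose-first code it replaces:

    # Standalone equivalence check (illustrative only; not part of the model code).
    import math
    import torch

    def rotate_half(x):
        x1, x2 = x.chunk(2, dim=-1)
        return torch.cat((-x2, x1), dim=-1)

    bsz, seq_len, num_heads, head_dim = 1, 4, 2, 8           # toy sizes, chosen arbitrarily
    q = torch.randn(bsz, seq_len, num_heads, head_dim)
    k = torch.randn(bsz, seq_len, num_heads, head_dim)

    # Naive rotary table, broadcastable to [bsz, seq_len, num_heads, head_dim].
    inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim))
    freqs = torch.outer(torch.arange(seq_len).float(), inv_freq)
    emb = torch.cat((freqs, freqs), dim=-1)                   # [seq_len, head_dim]
    cos, sin = emb.cos()[:, None, :], emb.sin()[:, None, :]   # [seq_len, 1, head_dim]

    # RoPE applied in the [bsz, seq, heads, head_dim] layout, as in the rewritten forward.
    q = (q * cos) + (rotate_half(q) * sin)
    k = (k * cos) + (rotate_half(k) * sin)

    # Conventional layout: transpose to [bsz, heads, seq, head_dim], then transpose k again.
    ref = torch.matmul(q.transpose(1, 2), k.transpose(1, 2).transpose(2, 3)) / math.sqrt(head_dim)

    # Rewritten layout: permute k directly to [bsz, heads, head_dim, seq] for the matmul.
    new = torch.matmul(q.transpose(1, 2), k.permute([0, 2, 3, 1])) / math.sqrt(head_dim)

    assert torch.allclose(ref, new, atol=1e-6)                # identical attention scores

Deferring the transposes and feeding cos/sin in from outside keeps the exported graph in one regular pattern, presumably so that a graph-level attention-fusion pass can match it during conversion; the sketch only verifies numerical equivalence, not the fusion itself.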