From 65ec0ea4062cdb99ec68ebab2abf6747fd7b103b Mon Sep 17 00:00:00 2001 From: xiaying Date: Sat, 15 Jun 2024 15:39:59 +0800 Subject: [PATCH] MNN:Sync: Fix bug for llama2/llama3 attention fuse, refract llm usage --- CMakeLists.txt | 2 + docs/compile/engine.md | 6 +- docs/compile/{tools.md => other.md} | 24 +- docs/index.rst | 10 +- docs/transformers/diffusion.md | 3 + docs/transformers/llm.md | 198 +++ express/Executor.cpp | 32 +- express/Expr.cpp | 5 +- express/RuntimeAttr.hpp | 1 - express/module/Module.cpp | 2 +- include/MNN/Interpreter.hpp | 21 + pymnn/src/util.h | 48 +- .../arm82/asm/arm64/MNNPackedMatMulFP16.S | 2 +- .../low_memory/MNNPackedMatMulFP16_int4.S | 352 ++-- .../low_memory/MNNPackedMatMulFP16_int8.S | 361 ++--- .../MNNPackedMatMulRemainFP16_int4.S | 183 ++- .../MNNPackedMatMulRemainFP16_int8.S | 178 +- source/backend/cpu/CMakeLists.txt | 2 - source/backend/cpu/CPUAttention.cpp | 204 ++- source/backend/cpu/CPUAttention.hpp | 43 +- source/backend/cpu/CPURaster.cpp | 8 +- .../low_memory/MNNPackedMatMulRemain_int4.S | 98 +- .../low_memory/MNNPackedMatMulRemain_int8.S | 69 +- .../arm64/low_memory/MNNPackedMatMul_int4.S | 60 +- .../arm64/low_memory/MNNPackedMatMul_int8.S | 53 +- .../backend/cpu/compute/CommonOptFunction.cpp | 42 +- .../backend/cpu/compute/ConvolutionHybrid.cpp | 30 +- .../compute/DenseConvolutionTiledExecutor.cpp | 177 +- .../cpu/compute/StrassenMatmulComputor.cpp | 3 +- source/backend/cpu/x86_x64/avx/GemmAVX2.cpp | 16 +- .../backend/cpu/x86_x64/avx/GemmFunction.hpp | 928 +++++++++-- source/backend/cpu/x86_x64/sse/GemmCommon.hpp | 21 + .../backend/cpu/x86_x64/sse/GemmFunction.hpp | 135 +- source/backend/cpu/x86_x64/sse/GemmSSE.cpp | 16 +- source/backend/metal/AllShader.cpp | 30 +- source/backend/metal/CMakeLists.txt | 18 - source/backend/metal/MetalAttention.mm | 56 +- source/backend/metal/MetalConvolution1x1.mm | 7 +- .../backend/metal/MetalConvolutionCommon.mm | 57 +- .../metal/shader/MetalConvolution1x1.metal | 99 +- .../opencl/execution/cl/opencl_program.cc | 2 +- source/core/ConvolutionCommon.cpp | 1 + source/core/IDSTEncoder.hpp | 50 +- source/core/Interpreter.cpp | 31 +- source/core/Pipeline.cpp | 4 +- source/core/Pipeline.hpp | 2 +- source/core/Session.cpp | 40 +- source/core/Session.hpp | 4 + source/core/TensorUtils.cpp | 489 +++--- source/core/TensorUtils.hpp | 12 +- source/geometry/GeometryBinary.cpp | 167 +- source/geometry/GeometryComputer.cpp | 84 +- source/geometry/GeometryComputer.hpp | 7 +- source/geometry/GeometryComputerUtils.cpp | 10 +- test.sh | 10 +- test/CommonOpCreator.hpp | 4 +- test/MNNTestSuite.cpp | 34 +- test/core/RegionFuse.cpp | 48 +- test/expr/MatMulTest.cpp | 53 + test/op/ConvInt8Test.cpp | 2 +- test/op/ConvolutionTest.cpp | 113 +- test/op/ZerosLikeTest.cpp | 11 + tools/converter/include/config.hpp | 1 + .../source/common/WeightQuantAndCoding.cpp | 56 +- tools/converter/source/common/cli.cpp | 10 +- .../source/common/convertToStaticModel.cpp | 2 +- .../source/optimizer/merge/FuseAttention.cpp | 29 +- .../source/optimizer/merge/MergeHelpers.cpp | 5 + .../source/optimizer/merge/MergeHelpers.hpp | 1 + .../postconvert/RemoveInvalidCast.cpp | 55 +- tools/cpp/ExprDebug.hpp | 8 +- tools/cpp/ModuleBasic.cpp | 7 + tools/script/apply_gptq.py | 187 +++ transformers/llm/config.json | 9 + transformers/llm/engine/include/llm.hpp | 533 +++--- transformers/llm/engine/include/tokenizer.hpp | 36 +- transformers/llm/engine/llm_demo.cpp | 30 +- transformers/llm/engine/src/llm.cpp | 1092 +++++-------- transformers/llm/engine/src/tokenizer.cpp | 177 +- 
transformers/llm/export/llm_export.py | 289 +++- .../Baichuan2-7B-Chat/modeling_baichuan.py | 77 +- .../Llama-2-7b-chat-ms/modeling_llama.py | 42 +- .../Llama-3-8B-Instruct/modeling_llama.py | 42 +- .../llm_models/MiniCPM-1.2b/config.json | 0 .../llm_models/MiniCPM-2.4b/config.json | 0 .../llm_models/Qwen-7B-Chat/modeling_qwen.py | 9 +- .../llm_models/Qwen-VL-Chat/modeling_qwen.py | 14 +- .../Qwen1_5-0_5B-Chat/modeling_qwen2.py | 6 +- .../Qwen1_5-1_8B-Chat/modeling_qwen2.py | 6 +- .../Qwen1_5-4B-Chat/modeling_qwen2.py | 6 +- .../Qwen1_5-7B-Chat/modeling_qwen2.py | 6 +- .../Qwen2-0_5B-Instruct/config.json | 31 + .../configuration_qwen2.py | 144 ++ .../Qwen2-0_5B-Instruct/modeling_qwen2.py | 1436 +++++++++++++++++ .../Qwen2-1_5B-Instruct/config.json | 31 + .../configuration_qwen2.py | 144 ++ .../Qwen2-1_5B-Instruct/modeling_qwen2.py | 1436 +++++++++++++++++ .../export/llm_models/Qwen2-1_5B/config.json | 31 + .../Qwen2-1_5B/configuration_qwen2.py | 144 ++ .../llm_models/Qwen2-1_5B/modeling_qwen2.py | 1434 ++++++++++++++++ .../llm_models/Qwen2-7B-Instruct/config.json | 31 + .../Qwen2-7B-Instruct/configuration_qwen2.py | 144 ++ .../Qwen2-7B-Instruct/modeling_qwen2.py | 1436 +++++++++++++++++ .../config.json | 0 .../configuration_llama.py | 0 .../modeling_llama.py | 42 +- .../llm_models/Yi-6B-Chat/modeling_llama.py | 42 +- .../deepseek-llm-7b-chat/modeling_llama.py | 42 +- .../glm-4-9b-chat/modeling_chatglm.py | 1238 ++++++++++++++ .../internlm-chat-7b/modeling_internlm.py | 39 +- 110 files changed, 12586 insertions(+), 2772 deletions(-) rename docs/compile/{tools.md => other.md} (87%) create mode 100644 docs/transformers/diffusion.md create mode 100644 docs/transformers/llm.md create mode 100644 tools/script/apply_gptq.py create mode 100755 transformers/llm/config.json mode change 100755 => 100644 transformers/llm/export/llm_models/MiniCPM-1.2b/config.json mode change 100755 => 100644 transformers/llm/export/llm_models/MiniCPM-2.4b/config.json create mode 100755 transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/config.json create mode 100644 transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/configuration_qwen2.py create mode 100644 transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/modeling_qwen2.py create mode 100755 transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/config.json create mode 100644 transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/configuration_qwen2.py create mode 100644 transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/modeling_qwen2.py create mode 100755 transformers/llm/export/llm_models/Qwen2-1_5B/config.json create mode 100644 transformers/llm/export/llm_models/Qwen2-1_5B/configuration_qwen2.py create mode 100644 transformers/llm/export/llm_models/Qwen2-1_5B/modeling_qwen2.py create mode 100755 transformers/llm/export/llm_models/Qwen2-7B-Instruct/config.json create mode 100644 transformers/llm/export/llm_models/Qwen2-7B-Instruct/configuration_qwen2.py create mode 100644 transformers/llm/export/llm_models/Qwen2-7B-Instruct/modeling_qwen2.py rename transformers/llm/export/llm_models/{TinyLlama-1.1B-Chat => TinyLlama-1_1B-Chat}/config.json (100%) rename transformers/llm/export/llm_models/{TinyLlama-1.1B-Chat => TinyLlama-1_1B-Chat}/configuration_llama.py (100%) rename transformers/llm/export/llm_models/{TinyLlama-1.1B-Chat => TinyLlama-1_1B-Chat}/modeling_llama.py (95%) create mode 100755 transformers/llm/export/llm_models/glm-4-9b-chat/modeling_chatglm.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 7e012f380..bd2220bc7 100644 --- 
a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,6 +61,8 @@ option(MNN_BUILD_LLM "Build llm library based MNN." OFF) option(MNN_BUILD_DIFFUSION "Build diffusion demo based MNN." OFF) option(MNN_INTERNAL "Build with MNN internal features, such as model authentication, metrics logging" OFF) option(MNN_JNI "Build MNN Jni for java to use" OFF) +option(MNN_SUPPORT_BF16 "Enable MNN's bf16 op" OFF) +option(MNN_LOW_MEMORY "Build MNN support low memory for weight quant model." OFF) IF (OHOS) include($ENV{NODE_PATH}/@ali/tcpkg/tcpkg.cmake) diff --git a/docs/compile/engine.md b/docs/compile/engine.md index 763202078..eb8eb6503 100644 --- a/docs/compile/engine.md +++ b/docs/compile/engine.md @@ -3,14 +3,14 @@ ## Linux/MacOS - 环境要求 - cmake >= 3.10 - - gcc >= 4.9 + - gcc >= 4.9 或者使用 clang - 相关编译选项 - - `MNN_ONEDNN` 是否使用oneDNN库来加速卷积运算 - `MNN_AVX512` 是否使用AVX512指令,需要gcc9以上版本编译 - `MNN_OPENCL` 是否使用OpenCL后端,针对GPU设备 + - `MNN_METAL` 是否使用Metal后端,针对MacOS/iOSGPU设备 - `MNN_VULKAN` 是否使用Vulkan后端,针对GPU设备 - `MNN_CUDA` 是否使用CUDA后端,针对Nivida GPU设备 - - `MNN_TENSORRT` 是否使用TensorRT后端,针对Nivida GPU设备 + - 其他编译选项可自行查看 CMakeLists.txt - 具体步骤 1. 准备工作 (可选,修改 MNN Schema 后需要) ```bash diff --git a/docs/compile/tools.md b/docs/compile/other.md similarity index 87% rename from docs/compile/tools.md rename to docs/compile/other.md index f119c397d..d0209f61b 100644 --- a/docs/compile/tools.md +++ b/docs/compile/other.md @@ -1,4 +1,4 @@ -# 工具模块编译 +# 其他模块编译 ## 模型转换工具 - 相关编译选项 @@ -31,6 +31,28 @@ - `runTrainDemo.out` 运行训练框架demo的入口程序 - `transformer` 训练模型转换器,将推理用的MNN模型转换为执行训练的MNN模型 - `extractForInfer` 从执行训练的MNN模型中提取参数,对应更新推理用的MNN模型 +## 生成式模型 +- 相关编译选项 + - `MNN_BUILD_DIFFUSION` 是否编译扩散模型推理示例 + - `MNN_BUILD_LLM` 是否编译大语言模型推理引擎 + - `MNN_SUPPORT_TRANSFORMER_FUSE` 是否支持`transformer`相关的融合算子,主要加速transformer模型 +- 编译命令 + - 编译扩散模型推理示例 + ```bash + mkdir build && cd build + cmake .. -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_BUILD_DIFFUSION=ON -DMNN_SUPPORT_TRANSFORMER_FUSE=ON + make -j4 + ``` + - 编译大语言模型推理引擎 + ```bash + mkdir build && cd build + cmake .. -DMNN_BUILD_LLM=ON -DMNN_SUPPORT_TRANSFORMER_FUSE=ON + make -j4 + ``` +- 编译产物 + - `libllm.so` 大语言模型推理库 + - `llm_demo` 大语言模型推理示例程序 + - `diffusion_demo` 扩散模型示例程序 ## 测试工具 - 相关编译选项 - `MNN_BUILD_TOOL` 是否编译测试工具 diff --git a/docs/index.rst b/docs/index.rst index ac2730945..8c97f2410 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -31,7 +31,7 @@ compile/cmake compile/engine - compile/tools + compile/other compile/pymnn .. toctree:: @@ -62,6 +62,14 @@ train/finetune train/distl +.. toctree:: + :maxdepth: 1 + :caption: 生成式模型 + :name: transformers + + transformers/diffusion + transformers/llm + .. toctree:: :maxdepth: 1 :caption: 测试工具 diff --git a/docs/transformers/diffusion.md b/docs/transformers/diffusion.md new file mode 100644 index 000000000..da22cb304 --- /dev/null +++ b/docs/transformers/diffusion.md @@ -0,0 +1,3 @@ +# 扩散模型 + +TODO \ No newline at end of file diff --git a/docs/transformers/llm.md b/docs/transformers/llm.md new file mode 100644 index 000000000..ea671993b --- /dev/null +++ b/docs/transformers/llm.md @@ -0,0 +1,198 @@ +# 大语言模型 + +基于MNN开发的LLM推理引擎,支持目前主流的开源LLM模型。该功能分为2部分: +- 模型导出:将torch模型导出为onnx,然后转换为mnn模型;导出tokenizer文件,embedding等文件; +- 模型推理:支持导出的模型推理,支持LLM模型的文本生成; + +## 模型导出 + +`llm_export`是一个llm模型导出工具,能够将llm模型导出为onnx和mnn模型。 + +### 用法 +1. 将需要导出的LLM项目clone到本地,如:Qwen2-0.5B-Instruct +```sh +git clone https://www.modelscope.cn/qwen/Qwen2-0.5B-Instruct.git +``` +3. 
执行`llm_export.py`导出模型 +```sh +cd ./transformers/llm/export +# 导出模型,tokenizer和embedding,并导出对应的mnn模型 +python llm_export.py \ + --type Qwen2-0_5B-Instruct \ + --path /path/to/Qwen2-0.5B-Instruct \ + --export \ + --export_token \ + --export_embed --embed_bin \ + --export_mnn +``` +4. 导出产物 +导出产物为: +1. `embeddings_bf16.bin`: 模型的embedding权重二进制文件,推理时使用; +2. `llm_config.json`: 模型的配置信息,推理时使用; +3. `llm.onnx`: 模型的onnx文件,推理时不使用; +4. `tokenizer.txt`: 模型的tokenzier文件,推理时使用; +5. `llm.mnn`: 模型的mnn文件,推理时使用; +6. `llm.mnn.weight`: 模型的mnn权重,推理时使用; +目录结构如下所示: +``` +. +├── onnx +| ├── embeddings_bf16.bin +| ├── llm_config.json +| ├── llm.onnx +| └── tokenizer.txt +└── mnn + ├── llm.mnn + └── llm.mnn.weight +``` + +### 功能 +- 支持将模型完整导出为一个onnx模型,使用`--export` +- 支持将模型分段导出为多个模型,使用`--export_split` +- 支持导出模型的词表到一个文本文件,每行代表一个token;其中token使用base64编码;使用`--export_verbose` +- 支持导出模型的Embedding层为一个onnx模型,使用`--export_embed`,同时支持bf16格式,使用`--embed_bf16` +- 支持分层导出模型的block,使用`--export_blocks`导出全部层;使用`--export_block $id`导出指定层 +- 支持导出模型的lm_head层为一个onnx模型,使用`--export_lm` +- 支持导出多模态模型的visual模型为一个onnx模型,使用`--export_visual` +- 支持对模型进行对话测试,使用`--test $query`会返回llm的回复内容 +- 支持在导出onnx模型后使用onnxruntime对结果一致性进行校验,使用`--export_test` +- 支持将tokenizer导出为文本文件,使用`--export_token` +- 支持将导出的onnx模型转换为mnn模型,默认转换为非对称4bit量化,使用`--export_mnn` +- 指定导出路径使用`--onnx_path`和`--mnn_path` +- 默认会使用onnx-slim对onnx模型进行优化,跳过该步骤使用`--skip_slim` +- 支持合并lora权重后导出,指定lora权重的目录使用`--lora_path` + +### 参数 +``` +usage: llm_export.py [-h] --path PATH + [--type {chatglm-6b,chatglm2-6b,chatglm3-6b,codegeex2-6b,Qwen-7B-Chat,Qwen-1_8B-Chat,Qwen-1_8B,Qwen-VL-Chat,Qwen1_5-0_5B-Chat,Qwen1_5-1_8B-Chat,Qwen1_5-4B-Chat,Qwen1_5-7B-Chat,Qwen2-1_5B-Instruct,Baichuan2-7B-Chat,Llama-2-7b-chat-ms,Llama-3-8B-Instruct,internlm-chat-7b,TinyLlama-1_1B-Chat,Yi-6B-Chat,deepseek-llm-7b-chat,phi-2,bge-large-zh,lora}] + [--lora_path LORA_PATH] [--onnx_path ONNX_PATH] [--mnn_path MNN_PATH] [--export_mnn] [--export_verbose] [--export_test] [--test TEST] [--export] [--export_split] [--export_token] + [--export_embed] [--export_visual] [--export_lm] [--export_block EXPORT_BLOCK] [--export_blocks] [--embed_bin] [--embed_bf16] [--skip_slim] + +llm_exporter + +options: + -h, --help show this help message and exit + --path PATH path(`str` or `os.PathLike`): + Can be either: + - A string, the *model id* of a pretrained model like `THUDM/chatglm-6b`. [TODO] + - A path to a *directory* clone from repo like `../chatglm-6b`. + --type {chatglm-6b,chatglm2-6b,chatglm3-6b,codegeex2-6b,Qwen-7B-Chat,Qwen-1_8B-Chat,Qwen-1_8B,Qwen-VL-Chat,Qwen1_5-0_5B-Chat,Qwen1_5-1_8B-Chat,Qwen1_5-4B-Chat,Qwen1_5-7B-Chat,Qwen2-1_5B-Instruct,Baichuan2-7B-Chat,Llama-2-7b-chat-ms,Llama-3-8B-Instruct,internlm-chat-7b,TinyLlama-1_1B-Chat,Yi-6B-Chat,deepseek-llm-7b-chat,phi-2,bge-large-zh,lora} + type(`str`, *optional*): + The pretrain llm model type. + --lora_path LORA_PATH + lora path, defaut is `None` mean not apply lora. + --onnx_path ONNX_PATH + export onnx model path, defaut is `./onnx`. + --mnn_path MNN_PATH export mnn model path, defaut is `./mnn`. + --export_mnn Whether or not to export mnn model after onnx. + --export_verbose Whether or not to export onnx with verbose. + --export_test Whether or not to export onnx with test using onnxruntime. + --test TEST test model inference with query `TEST`. + --export export model to an `onnx` model. + --export_split export model split to some `onnx` models: + - embedding model. + - block models. + - lm_head model. + --export_token export llm tokenizer to a txt file. 
+ --export_embed export llm embedding to an `onnx` model. + --export_visual export llm visual model to an `onnx` model. + --export_lm export llm lm_head to an `onnx` model. + --export_block EXPORT_BLOCK + export llm block [id] to an `onnx` model. + --export_blocks export llm all blocks to `onnx` models. + --embed_bin export embedding weight as bin file with dtype `bfloat16` + --embed_bf16 using `bfloat16` replace `float32` in embedding. + --skip_slim Whether or not to skip onnx-slim. +``` + +## 模型推理 + +### 编译 + +[从源码编译](../compile/tools.html#id4) + +### 使用 +#### 运行时配置 + +##### 运行时文件 +将导出产物中用于模型推理的部分置于同一个文件夹下,添加一个配置文件`config.json`来描述模型名称与推理参数,目录如下: +``` +. +└── model_dir + ├── config.json + ├── embeddings_bf16.bin + ├── llm_config.json + ├── llm.mnn + ├── llm.mnn.weight + └── tokenizer.txt +``` + +##### 配置项 +配置文件支持以下配置: +- 模型文件信息 + - base_dir: 模型文件加载的文件夹目录,默认为config.json的所在目录,或模型所在目录; + - llm_config: `llm_config.json`的实际名称路径为`base_dir + llm_config`,默认为`base_dir + 'config.json'` + - llm_model: `llm.mnn`的实际名称路径为`base_dir + llm_model`,默认为`base_dir + 'llm.mnn'` + - llm_weight: `llm.mnn.weight`的实际名称路径为`base_dir + llm_weight`,默认为`base_dir + 'llm.mnn.weight'` + - block_model: 分段模型时`block_{idx}.mnn`的实际路径为`base_dir + block_model`,默认为`base_dir + 'block_{idx}.mnn'` + - lm_model: 分段模型时`lm.mnn`的实际路径为`base_dir + lm_model`,默认为`base_dir + 'lm.mnn'` + - embedding_model: 当embedding使用模型时,embedding的实际路径为`base_dir + embedding_model`,默认为`base_dir + 'embedding.mnn'` + - embedding_file: 当embedding使用二进制时,embedding的实际路径为`base_dir + embedding_file`,默认为`base_dir + 'embeddings_bf16.bin'` + - tokenizer_file: `tokenizer.txt`的实际名称路径为`base_dir + tokenizer_file`,默认为`base_dir + 'tokenizer.txt'` + - visual_model: 当使用VL模型时,visual_model的实际路径为`base_dir + visual_model`,默认为`base_dir + 'visual.mnn'` +- 推理配置 + - max_new_tokens: 生成时最大token数,默认为`512` +- 硬件配置 + - backend_type: 推理使用硬件后端类型,默认为:`"cpu"` + - thread_num: 推理使用硬件线程数,默认为:`4` + - precision: 推理使用精度策略,默认为:`"low"`,尽量使用`fp16` + - memory: 推理使用内存策略,默认为:`"low"`,开启运行时量化 + +##### 配置文件示例 +- `config.json` + ```json + { + "llm_model": "qwen2-1.5b-int4.mnn", + "llm_weight": "qwen2-1.5b-int4.mnn.weight", + + "backend_type": "cpu", + "thread_num": 4, + "precision": "low", + "memory": "low" + } + ``` +- `llm_config.json` + ```json + { + "hidden_size": 1536, + "layer_nums": 28, + "attention_mask": "float", + "key_value_shape": [ + 2, + 1, + 0, + 2, + 128 + ], + "prompt_template": "<|im_start|>user\n%s<|im_end|>\n<|im_start|>assistant\n", + "is_visual": false, + "is_single": true + } + ``` + +#### 推理用法 +`llm_demo`的用法如下: +``` +# 使用config.json +## 交互式聊天 +./llm_demo model_dir/config.json +## 针对prompt中的每行进行回复 +./llm_demo model_dir/config.json prompt.txt + +# 不使用config.json, 使用默认配置 +## 交互式聊天 +./llm_demo model_dir/llm.mnn +## 针对prompt中的每行进行回复 +./llm_demo model_dir/llm.mnn prompt.txt +``` \ No newline at end of file diff --git a/express/Executor.cpp b/express/Executor.cpp index 93f0fd486..0edb9d6ad 100644 --- a/express/Executor.cpp +++ b/express/Executor.cpp @@ -243,38 +243,10 @@ void Executor::RuntimeManager::destroy(RuntimeManager* rtmgr) { } void Executor::RuntimeManager::setMode(Interpreter::SessionMode mode) { - if (mode == Interpreter::Session_Input_Inside || mode == Interpreter::Session_Input_User) { - mInside->modes.inputMode = mode; - } else if (mode == Interpreter::Session_Output_User || mode == Interpreter::Session_Output_Inside) { - mInside->modes.outputMode = mode; - } else if (mode == Interpreter::Session_Backend_Auto || mode == Interpreter::Session_Backend_Fix) { - 
mInside->modes.backendMode = mode; - } else if (mode == Interpreter::Session_Debug || mode == Interpreter::Session_Release) { - mInside->modes.callBackMode = mode; - } else if (mode == Interpreter::Session_Resize_Direct || mode == Interpreter::Session_Resize_Defer) { - mInside->modes.resizeMode = mode; - } else if(mode == Interpreter::Session_Memory_Collect || mode == Interpreter::Session_Memory_Cache) { - mInside->modes.memoryUsageMode = mode; - } else if(mode == Interpreter::Session_Codegen_Disable || mode == Interpreter::Session_Codegen_Enable) { - mInside->modes.codegenMode = mode; - } + mInside->modes.setMode(mode); } void Executor::RuntimeManager::setHint(Interpreter::HintMode mode, int value) { - switch (mode) { - case Interpreter::MAX_TUNING_NUMBER: - mInside->modes.maxTuningNumber = value; - break; - case Interpreter::STRICT_CHECK_MODEL: - mInside->checkNetBuffer = value > 0; - break; - case Interpreter::MEM_ALLOCATOR_TYPE: - mInside->modes.memoryAllocatorType = value; - break; - case Interpreter::WINOGRAD_MEMORY_LEVEL: - mInside->modes.winogradMemoryUsed = value; - default: - break; - } + mInside->modes.setHint(mode, value); } bool Executor::RuntimeManager::getInfo(Interpreter::SessionInfoCode code, void* ptr) { // Only support get memory diff --git a/express/Expr.cpp b/express/Expr.cpp index 4ff29a59c..aa664ad24 100644 --- a/express/Expr.cpp +++ b/express/Expr.cpp @@ -372,7 +372,7 @@ VARP Variable::create(EXPRP expr, int index) { res.fix(VARP::CONSTANT); return res; } - // CONTENT Mode + // CONTENT Mode, Use Geometry Computer to Decompress Expr do { if (!(executor->getLazyMode() & Executor::LAZY_CONTENT)) { break; @@ -398,7 +398,8 @@ VARP Variable::create(EXPRP expr, int index) { outputTensors[i] = expr->mInside->mOutputTensors[i]; } auto bn = executor->getAttr()->constantBackend; - GeometryComputer::Context context(bn); + // TODO: Support set mask + GeometryComputer::Context context(Interpreter::GeometryComputeMask::GEOMETRCOMPUTEMASK_ALL, bn); auto geo = GeometryComputer::search(expr->get()->type(), Runtime::Compiler_Loop); CommandBuffer cmd; res = geo->onCompute(expr->get(), inputTensors, outputTensors, context, cmd); diff --git a/express/RuntimeAttr.hpp b/express/RuntimeAttr.hpp index 0aef32824..3272cde95 100644 --- a/express/RuntimeAttr.hpp +++ b/express/RuntimeAttr.hpp @@ -21,7 +21,6 @@ struct RuntimeAttr { // Use for static module to compute flops float mFlops; std::string mExternalFile; - bool checkNetBuffer = true; }; struct ExecutorAttr { std::shared_ptr constantBackend; diff --git a/express/module/Module.cpp b/express/module/Module.cpp index d56c944cd..00b0a63bc 100644 --- a/express/module/Module.cpp +++ b/express/module/Module.cpp @@ -351,7 +351,7 @@ static Module* loadInternal(const std::vector& inputs, const std::v } bool checkMNNBuffer = true; if (nullptr != _rtMgr) { - checkMNNBuffer = _rtMgr->getInside()->checkNetBuffer; + checkMNNBuffer = _rtMgr->getInside()->modes.checkNetBuffer; } if (checkMNNBuffer) { flatbuffers::Verifier verify(buffer, length); diff --git a/include/MNN/Interpreter.hpp b/include/MNN/Interpreter.hpp index 5a6e235fb..16344a52b 100644 --- a/include/MNN/Interpreter.hpp +++ b/include/MNN/Interpreter.hpp @@ -203,7 +203,28 @@ class MNN_PUBLIC Interpreter { MEM_ALLOCATOR_TYPE = 2, // Winograd unit candidates count, default 3. if set 0, will use less unit candidates for less memory at the expense of performance. 
WINOGRAD_MEMORY_LEVEL = 3, + + // Geometry Compute option, default is 0xFFFF + GEOMETRY_COMPUTE_MASK = 4, + }; + + enum GeometryComputeMask { + // Support Region Fuse + GEOMETRCOMPUTEMASK_FUSEREGION = 1 << 0, + + // Support Region Fuse to input with multi-region, eg: pad + concat + GEOMETRCOMPUTEMASK_FUSEREGION_MULTI = 1 << 1, + + // Use loop instead of raster + compute if possible + GEOMETRCOMPUTEMASK_USELOOP = 1 << 2, + + // Support Geometry Cache, if shape changed, will try recompute, and then run compute if failed + GEOMETRCOMPUTEMASK_OPENCACHE = 1 << 3, + + // Full option open mask, for example, if want to close useloop, can set mask as (GEOMETRCOMPUTEMASK_ALL - GEOMETRCOMPUTEMASK_USELOOP) + GEOMETRCOMPUTEMASK_ALL = 0xFFFF, }; + /** * @brief The API shoud be called before create session. * @param mode Hint type diff --git a/pymnn/src/util.h b/pymnn/src/util.h index f3c855578..bd33cc895 100644 --- a/pymnn/src/util.h +++ b/pymnn/src/util.h @@ -667,23 +667,45 @@ inline bool getScheduleConfig(PyObject* dict, MNN::ScheduleConfig &config) { } config.numThread = (int)toInt(numThread); } - { - //precision - PyObject *obj = PyDict_GetItemString(dict, "precision"); + //power + PyObject *obj = PyDict_GetItemString(dict, "power"); if (obj) { - auto obj_name = object2String(obj); - if (!obj_name.compare("low")) { - MNN_PRINT("MNN use low precision\n"); - backendConfig->precision = MNN::BackendConfig::Precision_Low; + if (isInt(obj)) { + backendConfig->power = (MNN::BackendConfig::PowerMode)toInt(obj); } - if (!obj_name.compare("Low_BF16")) { - MNN_PRINT("MNN use lowBF precision\n"); - backendConfig->precision = MNN::BackendConfig::Precision_Low_BF16; + } + } + { + //memory + PyObject *obj = PyDict_GetItemString(dict, "memory"); + if (obj) { + if (isInt(obj)) { + backendConfig->memory = (MNN::BackendConfig::MemoryMode)toInt(obj); } - if (!obj_name.compare("high")) { - MNN_PRINT("MNN use high precision\n"); - backendConfig->precision = MNN::BackendConfig::Precision_High; + } + } + { + //precision + PyObject *obj = PyDict_GetItemString(dict, "precision"); + if (obj) { + if (isInt(obj)) { + backendConfig->precision = (MNN::BackendConfig::PrecisionMode)toInt(obj); + } else { + // For compability + auto obj_name = object2String(obj); + if (!obj_name.compare("low")) { + MNN_PRINT("MNN use low precision\n"); + backendConfig->precision = MNN::BackendConfig::Precision_Low; + } + if (!obj_name.compare("Low_BF16")) { + MNN_PRINT("MNN use lowBF precision\n"); + backendConfig->precision = MNN::BackendConfig::Precision_Low_BF16; + } + if (!obj_name.compare("high")) { + MNN_PRINT("MNN use high precision\n"); + backendConfig->precision = MNN::BackendConfig::Precision_High; + } } } } diff --git a/source/backend/arm82/asm/arm64/MNNPackedMatMulFP16.S b/source/backend/arm82/asm/arm64/MNNPackedMatMulFP16.S index 7d1caf9f4..f1a462b93 100644 --- a/source/backend/arm82/asm/arm64/MNNPackedMatMulFP16.S +++ b/source/backend/arm82/asm/arm64/MNNPackedMatMulFP16.S @@ -14,7 +14,7 @@ // 8 * 24 MatMul asm_function MNNPackedMatMulFP16 //void MNNPackedMatMulFP16(FLOAT16* C, const FLOAT16* A, const FLOAT16* B, const size_t* parameter, const FLOAT16* postParameters, const FLOAT16* bias); -// x0: C, x1:A, x2:B, x3:parameter, x5: postParameters, x6:bias +// x0: C, x1:A, x2:B, x3:parameter, x4: postParameters, x5:bias stp d14, d15, [sp, #-64]! 
stp d12, d13, [sp, #16] stp d10, d11, [sp, #32] diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int4.S b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int4.S index 72f7e12bd..118b4f104 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int4.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int4.S @@ -11,25 +11,106 @@ .text .align 5 + +.macro FMLA_8 d0, d1, d2, d3, d4, d5, d6, d7, s0, s1 + fmla \d0\().8h, \s0\().8h, \s1\().h[0] + fmla \d1\().8h, \s0\().8h, \s1\().h[1] + fmla \d2\().8h, \s0\().8h, \s1\().h[2] + fmla \d3\().8h, \s0\().8h, \s1\().h[3] + fmla \d4\().8h, \s0\().8h, \s1\().h[4] + fmla \d5\().8h, \s0\().8h, \s1\().h[5] + fmla \d6\().8h, \s0\().8h, \s1\().h[6] + fmla \d7\().8h, \s0\().8h, \s1\().h[7] +.endm + +.macro FMLA_4 d0, d1, d2, d3, s0, s1 + fmla \d0\().8h, \s0\().8h, \s1\().h[0] + fmla \d1\().8h, \s0\().8h, \s1\().h[1] + fmla \d2\().8h, \s0\().8h, \s1\().h[2] + fmla \d3\().8h, \s0\().8h, \s1\().h[3] +.endm + +.macro FMUL_8 d0, d1, d2, d3, d4, d5, d6, d7, s0, s1 + fmul \d0\().8h, \s0\().8h, \s1\().h[0] + fmul \d1\().8h, \s0\().8h, \s1\().h[1] + fmul \d2\().8h, \s0\().8h, \s1\().h[2] + fmul \d3\().8h, \s0\().8h, \s1\().h[3] + fmul \d4\().8h, \s0\().8h, \s1\().h[4] + fmul \d5\().8h, \s0\().8h, \s1\().h[5] + fmul \d6\().8h, \s0\().8h, \s1\().h[6] + fmul \d7\().8h, \s0\().8h, \s1\().h[7] +.endm + +.macro FMUL_4 d0, d1, d2, d3, s0, s1 + fmul \d0\().8h, \s0\().8h, \s1\().h[0] + fmul \d1\().8h, \s0\().8h, \s1\().h[1] + fmul \d2\().8h, \s0\().8h, \s1\().h[2] + fmul \d3\().8h, \s0\().8h, \s1\().h[3] +.endm + +.macro FADD_12 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, z0 + fadd \d0\().8h, \d0\().8h, \z0\().8h + fadd \d1\().8h, \d1\().8h, \z0\().8h + fadd \d2\().8h, \d2\().8h, \z0\().8h + fadd \d3\().8h, \d3\().8h, \z0\().8h + fadd \d4\().8h, \d4\().8h, \z0\().8h + fadd \d5\().8h, \d5\().8h, \z0\().8h + fadd \d6\().8h, \d6\().8h, \z0\().8h + fadd \d7\().8h, \d7\().8h, \z0\().8h + fadd \d8\().8h, \d8\().8h, \z0\().8h + fadd \d9\().8h, \d9\().8h, \z0\().8h + fadd \d10\().8h, \d10\().8h, \z0\().8h + fadd \d11\().8h, \d11\().8h, \z0\().8h +.endm + +.macro FMAX_12 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, z0 + fmax \d0\().8h, \d0\().8h, \z0\().8h + fmax \d1\().8h, \d1\().8h, \z0\().8h + fmax \d2\().8h, \d2\().8h, \z0\().8h + fmax \d3\().8h, \d3\().8h, \z0\().8h + fmax \d4\().8h, \d4\().8h, \z0\().8h + fmax \d5\().8h, \d5\().8h, \z0\().8h + fmax \d6\().8h, \d6\().8h, \z0\().8h + fmax \d7\().8h, \d7\().8h, \z0\().8h + fmax \d8\().8h, \d8\().8h, \z0\().8h + fmax \d9\().8h, \d9\().8h, \z0\().8h + fmax \d10\().8h, \d10\().8h, \z0\().8h + fmax \d11\().8h, \d11\().8h, \z0\().8h +.endm + +.macro FMIN_12 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, z0 + fmin \d0\().8h, \d0\().8h, \z0\().8h + fmin \d1\().8h, \d1\().8h, \z0\().8h + fmin \d2\().8h, \d2\().8h, \z0\().8h + fmin \d3\().8h, \d3\().8h, \z0\().8h + fmin \d4\().8h, \d4\().8h, \z0\().8h + fmin \d5\().8h, \d5\().8h, \z0\().8h + fmin \d6\().8h, \d6\().8h, \z0\().8h + fmin \d7\().8h, \d7\().8h, \z0\().8h + fmin \d8\().8h, \d8\().8h, \z0\().8h + fmin \d9\().8h, \d9\().8h, \z0\().8h + fmin \d10\().8h, \d10\().8h, \z0\().8h + fmin \d11\().8h, \d11\().8h, \z0\().8h +.endm + // 8 * 24 MatMul asm_function MNNPackedMatMulFP16_int4 //void MNNPackedMatMulFP16_int4(FLOAT16* C, const FLOAT16* A, const FLOAT16* B, const size_t* parameter, const FLOAT16* postParameters, const FLOAT16* bias, const FLOAT16* k, const FLOAT16* b); // x0: C, x1:A, x2:B, 
x3:parameter, x4: postParameters, x5:bias, x6: quant_alpha, x7: quant_bias -stp d14, d15, [sp, #-80]! +stp d14, d15, [sp, #-96]! stp d12, d13, [sp, #16] stp d10, d11, [sp, #32] stp d8, d9, [sp, #48] stp x19, x20, [sp, #64] +stp x21, x22, [sp, #80] ldr x9, [x3, #8] // l ldr x10, [x3, #16] // h ldr x13, [x3, #24] // cStride ldr x11, [x3, #40] // bExtraStride - -// v0, v1, v2: A -// v3, v4: B -// v8 - v31: C +ldr x19, [x3, #48] // blockId +mov x20, x0 add x10, x10, #7 lsr x10, x10, #3 @@ -42,6 +123,7 @@ LH8: sub x14, x13, #128 LoopH: mov x15, x1 + mov x22, x2 ld1 {v4.8h, v5.8h}, [x6], #32 // alpha ld1 {v6.8h, v7.8h}, [x7], #32 // bias subs x12, x9, #1 @@ -62,35 +144,34 @@ LoopH: mov v1.8h, v6.8h fmla v1.8h, v0.8h, v4.8h + cbnz x19, LH8_BLOCK_GT_0 + + LH8_BLOCK0: ld1 {v0.8h}, [x15], #16 - fmul v8.8h, v1.8h, v0.h[0] - fmul v9.8h, v1.8h, v0.h[1] - fmul v10.8h, v1.8h, v0.h[2] - fmul v11.8h, v1.8h, v0.h[3] - fmul v12.8h, v1.8h, v0.h[4] - fmul v13.8h, v1.8h, v0.h[5] - fmul v14.8h, v1.8h, v0.h[6] - fmul v15.8h, v1.8h, v0.h[7] - - fmul v20.8h, v2.8h, v0.h[0] - fmul v21.8h, v2.8h, v0.h[1] - fmul v22.8h, v2.8h, v0.h[2] - fmul v23.8h, v2.8h, v0.h[3] - fmul v24.8h, v2.8h, v0.h[4] - fmul v25.8h, v2.8h, v0.h[5] - fmul v26.8h, v2.8h, v0.h[6] - fmul v27.8h, v2.8h, v0.h[7] + FMUL_8 v8, v9, v10, v11, v12, v13, v14, v15, v1, v0 + FMUL_8 v20, v21, v22, v23, v24, v25, v26, v27, v2, v0 + ld1 {v0.4h}, [x15], #8 + FMUL_4 v16, v17, v18, v19, v1, v0 + FMUL_4 v28, v29, v30, v31, v2, v0 + b LH8_INIT_END + + LH8_BLOCK_GT_0: + ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x20], #64 + ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x20], #64 + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x20], x14 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x20], #64 + ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x20], #64 + ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x20], x14 + + ld1 {v0.8h}, [x15], #16 + FMLA_8 v8, v9, v10, v11, v12, v13, v14, v15, v1, v0 + FMLA_8 v20, v21, v22, v23, v24, v25, v26, v27, v2, v0 ld1 {v0.4h}, [x15], #8 - fmul v16.8h, v1.8h, v0.h[0] - fmul v17.8h, v1.8h, v0.h[1] - fmul v18.8h, v1.8h, v0.h[2] - fmul v19.8h, v1.8h, v0.h[3] - fmul v28.8h, v2.8h, v0.h[0] - fmul v29.8h, v2.8h, v0.h[1] - fmul v30.8h, v2.8h, v0.h[2] - fmul v31.8h, v2.8h, v0.h[3] + FMLA_4 v16, v17, v18, v19, v1, v0 + FMLA_4 v28, v29, v30, v31, v2, v0 + LH8_INIT_END: beq LoopLEnd LoopL1: @@ -112,34 +193,11 @@ LoopH: fmla v1.8h, v0.8h, v4.8h ld1 {v0.8h}, [x15], #16 - fmla v8.8h, v1.8h, v0.h[0] - fmla v9.8h, v1.8h, v0.h[1] - fmla v10.8h, v1.8h, v0.h[2] - fmla v11.8h, v1.8h, v0.h[3] - fmla v12.8h, v1.8h, v0.h[4] - fmla v13.8h, v1.8h, v0.h[5] - fmla v14.8h, v1.8h, v0.h[6] - fmla v15.8h, v1.8h, v0.h[7] - - fmla v20.8h, v2.8h, v0.h[0] - fmla v21.8h, v2.8h, v0.h[1] - fmla v22.8h, v2.8h, v0.h[2] - fmla v23.8h, v2.8h, v0.h[3] - fmla v24.8h, v2.8h, v0.h[4] - fmla v25.8h, v2.8h, v0.h[5] - fmla v26.8h, v2.8h, v0.h[6] - fmla v27.8h, v2.8h, v0.h[7] - + FMLA_8 v8, v9, v10, v11, v12, v13, v14, v15, v1, v0 + FMLA_8 v20, v21, v22, v23, v24, v25, v26, v27, v2, v0 ld1 {v0.4h}, [x15], #8 - fmla v16.8h, v1.8h, v0.h[0] - fmla v17.8h, v1.8h, v0.h[1] - fmla v18.8h, v1.8h, v0.h[2] - fmla v19.8h, v1.8h, v0.h[3] - fmla v28.8h, v2.8h, v0.h[0] - fmla v29.8h, v2.8h, v0.h[1] - fmla v30.8h, v2.8h, v0.h[2] - fmla v31.8h, v2.8h, v0.h[3] - + FMLA_4 v16, v17, v18, v19, v1, v0 + FMLA_4 v28, v29, v30, v31, v2, v0 bne LoopL1 LoopLEnd: @@ -148,95 +206,24 @@ LoopH: sub x10, x10, #2 cmp x10, #2 - cbz x4, StoreLH8 + cbz x4, StoreLH8 // If postParameter* is nullptr, not the last blockId, just store the intemediate results. 
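The `blockId` handling introduced above gives this low-memory kernel a simple per-block accumulation contract: for block 0 the C tile is initialized with `fmul`, for later blocks C is reloaded through x20 and accumulated with `fmla`, and the bias/min-max post-treatment only runs when `postParameters` is non-null, i.e. on the final quantization block. A rough C++ sketch of how a caller might drive that contract follows; the wrapper name, the per-block pointer arrays and the `FLOAT16` typedef are illustrative assumptions, not MNN's actual dispatch code — only the kernel symbol and its signature come from this patch.
```cpp
#include <cstddef>

using FLOAT16 = __fp16;  // assumption: FP16 storage type used by the arm82 backend

// Kernel declaration as given in the patch (implemented in assembly above).
extern "C" void MNNPackedMatMulFP16_int4(FLOAT16* C, const FLOAT16* A, const FLOAT16* B,
                                         const size_t* parameter, const FLOAT16* postParameters,
                                         const FLOAT16* bias, const FLOAT16* k, const FLOAT16* b);

// Hypothetical driver: one dequant-GEMM call per weight-quantization block.
void packedMatMulBlockwise(FLOAT16* C, const FLOAT16* A,
                           const FLOAT16* const* Bblock,      // packed int4 weights, per block
                           const FLOAT16* const* alphaBlock,  // dequant scales, per block
                           const FLOAT16* const* qbiasBlock,  // dequant biases, per block
                           size_t* parameter, size_t blockNum,
                           const FLOAT16* postParameters, const FLOAT16* bias) {
    for (size_t blockId = 0; blockId < blockNum; ++blockId) {
        parameter[6] = blockId;  // read in the asm as `ldr x19, [x3, #48]`
        const bool last = (blockId + 1 == blockNum);
        // blockId == 0: kernel initializes C with fmul.
        // blockId  > 0: kernel reloads the C tile (via x20) and accumulates with fmla.
        // postParameters/bias are only supplied on the last block, so the bias add and
        // the min/max clamp are applied once, after all partial block sums are in C.
        MNNPackedMatMulFP16_int4(C, A, Bblock[blockId], parameter,
                                 last ? postParameters : nullptr,
                                 last ? bias : nullptr,
                                 alphaBlock[blockId], qbiasBlock[blockId]);
    }
}
```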
- AddBiasLH8: ld1 {v5.8h}, [x4] fcvtn v5.4h, v5.4s dup v6.8h, v5.h[2] // Min Value dup v7.8h, v5.h[3] // Max Value - ld1 {v0.8h, v1.8h}, [x5], #32 - - fmla v8.8h, v0.8h, v5.h[1] - fmla v9.8h, v0.8h, v5.h[1] - fmla v10.8h, v0.8h, v5.h[1] - fmla v11.8h, v0.8h, v5.h[1] - - fmla v12.8h, v0.8h, v5.h[1] - fmla v13.8h, v0.8h, v5.h[1] - fmla v14.8h, v0.8h, v5.h[1] - fmla v15.8h, v0.8h, v5.h[1] - - fmla v16.8h, v0.8h, v5.h[1] - fmla v17.8h, v0.8h, v5.h[1] - fmla v18.8h, v0.8h, v5.h[1] - fmla v19.8h, v0.8h, v5.h[1] - - fmla v20.8h, v1.8h, v5.h[1] - fmla v21.8h, v1.8h, v5.h[1] - fmla v22.8h, v1.8h, v5.h[1] - fmla v23.8h, v1.8h, v5.h[1] - - fmla v24.8h, v1.8h, v5.h[1] - fmla v25.8h, v1.8h, v5.h[1] - fmla v26.8h, v1.8h, v5.h[1] - fmla v27.8h, v1.8h, v5.h[1] - - fmla v28.8h, v1.8h, v5.h[1] - fmla v29.8h, v1.8h, v5.h[1] - fmla v30.8h, v1.8h, v5.h[1] - fmla v31.8h, v1.8h, v5.h[1] + + AddBiasLH8: + cbz x5, PostTreatLH8 + ld1 {v0.8h, v1.8h}, [x5], #32 // gemm bias + FADD_12 v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v0 + FADD_12 v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v1 PostTreatLH8: - fmax v8.8h, v8.8h, v6.8h - fmax v9.8h, v9.8h, v6.8h - fmax v10.8h, v10.8h, v6.8h - fmax v11.8h, v11.8h, v6.8h - fmax v12.8h, v12.8h, v6.8h - fmax v13.8h, v13.8h, v6.8h - fmax v14.8h, v14.8h, v6.8h - fmax v15.8h, v15.8h, v6.8h - fmax v16.8h, v16.8h, v6.8h - fmax v17.8h, v17.8h, v6.8h - fmax v18.8h, v18.8h, v6.8h - fmax v19.8h, v19.8h, v6.8h - fmax v20.8h, v20.8h, v6.8h - fmax v21.8h, v21.8h, v6.8h - fmax v22.8h, v22.8h, v6.8h - fmax v23.8h, v23.8h, v6.8h - fmax v24.8h, v24.8h, v6.8h - fmax v25.8h, v25.8h, v6.8h - fmax v26.8h, v26.8h, v6.8h - fmax v27.8h, v27.8h, v6.8h - fmax v28.8h, v28.8h, v6.8h - fmax v29.8h, v29.8h, v6.8h - fmax v30.8h, v30.8h, v6.8h - fmax v31.8h, v31.8h, v6.8h - - fmin v8.8h, v8.8h, v7.8h - fmin v9.8h, v9.8h, v7.8h - fmin v10.8h, v10.8h, v7.8h - fmin v11.8h, v11.8h, v7.8h - fmin v12.8h, v12.8h, v7.8h - fmin v13.8h, v13.8h, v7.8h - fmin v14.8h, v14.8h, v7.8h - fmin v15.8h, v15.8h, v7.8h - fmin v16.8h, v16.8h, v7.8h - fmin v17.8h, v17.8h, v7.8h - fmin v18.8h, v18.8h, v7.8h - fmin v19.8h, v19.8h, v7.8h - fmin v20.8h, v20.8h, v7.8h - fmin v21.8h, v21.8h, v7.8h - fmin v22.8h, v22.8h, v7.8h - fmin v23.8h, v23.8h, v7.8h - fmin v24.8h, v24.8h, v7.8h - fmin v25.8h, v25.8h, v7.8h - fmin v26.8h, v26.8h, v7.8h - fmin v27.8h, v27.8h, v7.8h - fmin v28.8h, v28.8h, v7.8h - fmin v29.8h, v29.8h, v7.8h - fmin v30.8h, v30.8h, v7.8h - fmin v31.8h, v31.8h, v7.8h + FMAX_12 v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v6 + FMAX_12 v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v6 + FMIN_12 v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v7 + FMIN_12 v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v7 StoreLH8: @@ -253,6 +240,7 @@ LoopH: LH4: cbz x10, End LoopHRemain: + //mov x22, x2 mov x15, x1 subs x12, x9, #1 ld1 {v20.8h}, [x6], #16 // alpha @@ -270,6 +258,15 @@ LoopHRemain: fmla v3.8h, v6.8h, v20.8h ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 + cbnz x19, LH4_BLOCK_GT_0 + + LH4_BLOCK0: + + FMUL_4 v8, v9, v10, v11, v3, v0 + FMUL_4 v12, v13, v14, v15, v3, v1 + FMUL_4 v16, v17, v18, v19, v3, v2 + b LH4_INIT_END + /* fmul v8.8h, v3.8h, v0.h[0] fmul v9.8h, v3.8h, v0.h[1] fmul v10.8h, v3.8h, v0.h[2] @@ -282,7 +279,17 @@ LoopHRemain: fmul v17.8h, v3.8h, v2.h[1] fmul v18.8h, v3.8h, v2.h[2] fmul v19.8h, v3.8h, v2.h[3] + */ + LH4_BLOCK_GT_0: + ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x20], #64 + ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x20], #64 + ld1 {v16.8h, 
v17.8h, v18.8h, v19.8h}, [x20] + FMLA_4 v8, v9, v10, v11, v3, v0 + FMLA_4 v12, v13, v14, v15, v3, v1 + FMLA_4 v16, v17, v18, v19, v3, v2 + + LH4_INIT_END: beq LoopLREnd LoopLR: @@ -300,86 +307,41 @@ LoopHRemain: fmla v3.8h, v6.8h, v20.8h ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 - fmla v8.8h, v3.8h, v0.h[0] - fmla v9.8h, v3.8h, v0.h[1] - fmla v10.8h, v3.8h, v0.h[2] - fmla v11.8h, v3.8h, v0.h[3] - fmla v12.8h, v3.8h, v1.h[0] - fmla v13.8h, v3.8h, v1.h[1] - fmla v14.8h, v3.8h, v1.h[2] - fmla v15.8h, v3.8h, v1.h[3] - fmla v16.8h, v3.8h, v2.h[0] - fmla v17.8h, v3.8h, v2.h[1] - fmla v18.8h, v3.8h, v2.h[2] - fmla v19.8h, v3.8h, v2.h[3] - + FMLA_4 v8, v9, v10, v11, v3, v0 + FMLA_4 v12, v13, v14, v15, v3, v1 + FMLA_4 v16, v17, v18, v19, v3, v2 bne LoopLR LoopLREnd: cbz x4, StoreLH4 - AddBiasLH4: + ld1 {v5.8h}, [x4] fcvtn v5.4h, v5.4s dup v6.8h, v5.h[2] // Min Value dup v7.8h, v5.h[3] // Max Value + AddBiasLH4: + cbz x5, PostTreatLH4 ld1 {v0.8h}, [x5], #16 - - fmla v8.8h, v0.8h, v5.h[1] - fmla v9.8h, v0.8h, v5.h[1] - fmla v10.8h, v0.8h, v5.h[1] - fmla v11.8h, v0.8h, v5.h[1] - - fmla v12.8h, v0.8h, v5.h[1] - fmla v13.8h, v0.8h, v5.h[1] - fmla v14.8h, v0.8h, v5.h[1] - fmla v15.8h, v0.8h, v5.h[1] - - fmla v16.8h, v0.8h, v5.h[1] - fmla v17.8h, v0.8h, v5.h[1] - fmla v18.8h, v0.8h, v5.h[1] - fmla v19.8h, v0.8h, v5.h[1] + FADD_12 v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v0 PostTreatLH4: - fmax v8.8h, v8.8h, v6.8h - fmax v9.8h, v9.8h, v6.8h - fmax v10.8h, v10.8h, v6.8h - fmax v11.8h, v11.8h, v6.8h - fmax v12.8h, v12.8h, v6.8h - fmax v13.8h, v13.8h, v6.8h - fmax v14.8h, v14.8h, v6.8h - fmax v15.8h, v15.8h, v6.8h - fmax v16.8h, v16.8h, v6.8h - fmax v17.8h, v17.8h, v6.8h - fmax v18.8h, v18.8h, v6.8h - fmax v19.8h, v19.8h, v6.8h - - fmin v8.8h, v8.8h, v7.8h - fmin v9.8h, v9.8h, v7.8h - fmin v10.8h, v10.8h, v7.8h - fmin v11.8h, v11.8h, v7.8h - fmin v12.8h, v12.8h, v7.8h - fmin v13.8h, v13.8h, v7.8h - fmin v14.8h, v14.8h, v7.8h - fmin v15.8h, v15.8h, v7.8h - fmin v16.8h, v16.8h, v7.8h - fmin v17.8h, v17.8h, v7.8h - fmin v18.8h, v18.8h, v7.8h - fmin v19.8h, v19.8h, v7.8h + FMAX_12 v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v6 + FMIN_12 v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v7 StoreLH4: st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #64 st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x0], #64 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0] - sub x10, x10, #1 End: +ldp x21, x22, [sp, #80] ldp x19, x20, [sp, #64] ldp d8, d9, [sp, #48] ldp d10, d11, [sp, #32] ldp d12, d13, [sp, #16] -ldp d14, d15, [sp], #80 +ldp d14, d15, [sp], #96 ret diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int8.S b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int8.S index 0ca03121b..8f92ac238 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int8.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int8.S @@ -11,24 +11,106 @@ .text .align 5 + +.macro FMLA_8 d0, d1, d2, d3, d4, d5, d6, d7, s0, s1 + fmla \d0\().8h, \s0\().8h, \s1\().h[0] + fmla \d1\().8h, \s0\().8h, \s1\().h[1] + fmla \d2\().8h, \s0\().8h, \s1\().h[2] + fmla \d3\().8h, \s0\().8h, \s1\().h[3] + fmla \d4\().8h, \s0\().8h, \s1\().h[4] + fmla \d5\().8h, \s0\().8h, \s1\().h[5] + fmla \d6\().8h, \s0\().8h, \s1\().h[6] + fmla \d7\().8h, \s0\().8h, \s1\().h[7] +.endm + +.macro FMLA_4 d0, d1, d2, d3, s0, s1 + fmla \d0\().8h, \s0\().8h, \s1\().h[0] + fmla \d1\().8h, \s0\().8h, \s1\().h[1] + fmla \d2\().8h, \s0\().8h, \s1\().h[2] + fmla \d3\().8h, \s0\().8h, \s1\().h[3] 
+.endm + +.macro FMUL_8 d0, d1, d2, d3, d4, d5, d6, d7, s0, s1 + fmul \d0\().8h, \s0\().8h, \s1\().h[0] + fmul \d1\().8h, \s0\().8h, \s1\().h[1] + fmul \d2\().8h, \s0\().8h, \s1\().h[2] + fmul \d3\().8h, \s0\().8h, \s1\().h[3] + fmul \d4\().8h, \s0\().8h, \s1\().h[4] + fmul \d5\().8h, \s0\().8h, \s1\().h[5] + fmul \d6\().8h, \s0\().8h, \s1\().h[6] + fmul \d7\().8h, \s0\().8h, \s1\().h[7] +.endm + +.macro FMUL_4 d0, d1, d2, d3, s0, s1 + fmul \d0\().8h, \s0\().8h, \s1\().h[0] + fmul \d1\().8h, \s0\().8h, \s1\().h[1] + fmul \d2\().8h, \s0\().8h, \s1\().h[2] + fmul \d3\().8h, \s0\().8h, \s1\().h[3] +.endm + +.macro FADD_12 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, z0 + fadd \d0\().8h, \d0\().8h, \z0\().8h + fadd \d1\().8h, \d1\().8h, \z0\().8h + fadd \d2\().8h, \d2\().8h, \z0\().8h + fadd \d3\().8h, \d3\().8h, \z0\().8h + fadd \d4\().8h, \d4\().8h, \z0\().8h + fadd \d5\().8h, \d5\().8h, \z0\().8h + fadd \d6\().8h, \d6\().8h, \z0\().8h + fadd \d7\().8h, \d7\().8h, \z0\().8h + fadd \d8\().8h, \d8\().8h, \z0\().8h + fadd \d9\().8h, \d9\().8h, \z0\().8h + fadd \d10\().8h, \d10\().8h, \z0\().8h + fadd \d11\().8h, \d11\().8h, \z0\().8h +.endm + +.macro FMAX_12 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, z0 + fmax \d0\().8h, \d0\().8h, \z0\().8h + fmax \d1\().8h, \d1\().8h, \z0\().8h + fmax \d2\().8h, \d2\().8h, \z0\().8h + fmax \d3\().8h, \d3\().8h, \z0\().8h + fmax \d4\().8h, \d4\().8h, \z0\().8h + fmax \d5\().8h, \d5\().8h, \z0\().8h + fmax \d6\().8h, \d6\().8h, \z0\().8h + fmax \d7\().8h, \d7\().8h, \z0\().8h + fmax \d8\().8h, \d8\().8h, \z0\().8h + fmax \d9\().8h, \d9\().8h, \z0\().8h + fmax \d10\().8h, \d10\().8h, \z0\().8h + fmax \d11\().8h, \d11\().8h, \z0\().8h +.endm + +.macro FMIN_12 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, z0 + fmin \d0\().8h, \d0\().8h, \z0\().8h + fmin \d1\().8h, \d1\().8h, \z0\().8h + fmin \d2\().8h, \d2\().8h, \z0\().8h + fmin \d3\().8h, \d3\().8h, \z0\().8h + fmin \d4\().8h, \d4\().8h, \z0\().8h + fmin \d5\().8h, \d5\().8h, \z0\().8h + fmin \d6\().8h, \d6\().8h, \z0\().8h + fmin \d7\().8h, \d7\().8h, \z0\().8h + fmin \d8\().8h, \d8\().8h, \z0\().8h + fmin \d9\().8h, \d9\().8h, \z0\().8h + fmin \d10\().8h, \d10\().8h, \z0\().8h + fmin \d11\().8h, \d11\().8h, \z0\().8h +.endm + // 8 * 24 MatMul asm_function MNNPackedMatMulFP16_int8 //void MNNPackedMatMulFP16_int8(FLOAT16* C, const FLOAT16* A, const FLOAT16* B, const size_t* parameter, const FLOAT16* postParameters, const FLOAT16* bias, const FLOAT16* k, const FLOAT16* b); // x0: C, x1:A, x2:B, x3:parameter, x4: postParameters, x5:bias, x6: quant_alpha, x7: quant_bias -stp d14, d15, [sp, #-64]! +stp d14, d15, [sp, #-96]! 
stp d12, d13, [sp, #16] stp d10, d11, [sp, #32] stp d8, d9, [sp, #48] +stp x19, x20, [sp, #64] +stp x21, x22, [sp, #80] ldr x9, [x3, #8] // l ldr x10, [x3, #16] // h ldr x13, [x3, #24] // cStride ldr x11, [x3, #40] // bExtraStride - -// v0, v1, v2: A -// v3, v4: B -// v8 - v31: C +ldr x19, [x3, #48] // blockId +mov x20, x0 add x10, x10, #7 lsr x10, x10, #3 @@ -41,6 +123,7 @@ LH8: sub x14, x13, #128 LoopH: mov x15, x1 + mov x22, x2 ld1 {v4.8h, v5.8h}, [x6], #32 // alpha ld1 {v6.8h, v7.8h}, [x7], #32 // bias subs x12, x9, #1 @@ -54,35 +137,34 @@ LoopH: mov v1.8h, v6.8h fmla v1.8h, v0.8h, v4.8h + cbnz x19, LH8_BLOCK_GT_0 + + LH8_BLOCK0: ld1 {v0.8h}, [x15], #16 - fmul v8.8h, v1.8h, v0.h[0] - fmul v9.8h, v1.8h, v0.h[1] - fmul v10.8h, v1.8h, v0.h[2] - fmul v11.8h, v1.8h, v0.h[3] - fmul v12.8h, v1.8h, v0.h[4] - fmul v13.8h, v1.8h, v0.h[5] - fmul v14.8h, v1.8h, v0.h[6] - fmul v15.8h, v1.8h, v0.h[7] - - fmul v20.8h, v2.8h, v0.h[0] - fmul v21.8h, v2.8h, v0.h[1] - fmul v22.8h, v2.8h, v0.h[2] - fmul v23.8h, v2.8h, v0.h[3] - fmul v24.8h, v2.8h, v0.h[4] - fmul v25.8h, v2.8h, v0.h[5] - fmul v26.8h, v2.8h, v0.h[6] - fmul v27.8h, v2.8h, v0.h[7] + FMUL_8 v8, v9, v10, v11, v12, v13, v14, v15, v1, v0 + FMUL_8 v20, v21, v22, v23, v24, v25, v26, v27, v2, v0 + ld1 {v0.4h}, [x15], #8 + FMUL_4 v16, v17, v18, v19, v1, v0 + FMUL_4 v28, v29, v30, v31, v2, v0 + b LH8_INIT_END + + LH8_BLOCK_GT_0: + ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x20], #64 + ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x20], #64 + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x20], x14 + + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x20], #64 + ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x20], #64 + ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x20], x14 + ld1 {v0.8h}, [x15], #16 + FMLA_8 v8, v9, v10, v11, v12, v13, v14, v15, v1, v0 + FMLA_8 v20, v21, v22, v23, v24, v25, v26, v27, v2, v0 ld1 {v0.4h}, [x15], #8 - fmul v16.8h, v1.8h, v0.h[0] - fmul v17.8h, v1.8h, v0.h[1] - fmul v18.8h, v1.8h, v0.h[2] - fmul v19.8h, v1.8h, v0.h[3] - fmul v28.8h, v2.8h, v0.h[0] - fmul v29.8h, v2.8h, v0.h[1] - fmul v30.8h, v2.8h, v0.h[2] - fmul v31.8h, v2.8h, v0.h[3] + FMLA_4 v16, v17, v18, v19, v1, v0 + FMLA_4 v28, v29, v30, v31, v2, v0 + LH8_INIT_END: beq LoopLEnd LoopL1: @@ -98,129 +180,37 @@ LoopH: fmla v1.8h, v0.8h, v4.8h ld1 {v0.8h}, [x15], #16 - fmla v8.8h, v1.8h, v0.h[0] - fmla v9.8h, v1.8h, v0.h[1] - fmla v10.8h, v1.8h, v0.h[2] - fmla v11.8h, v1.8h, v0.h[3] - fmla v12.8h, v1.8h, v0.h[4] - fmla v13.8h, v1.8h, v0.h[5] - fmla v14.8h, v1.8h, v0.h[6] - fmla v15.8h, v1.8h, v0.h[7] - - fmla v20.8h, v2.8h, v0.h[0] - fmla v21.8h, v2.8h, v0.h[1] - fmla v22.8h, v2.8h, v0.h[2] - fmla v23.8h, v2.8h, v0.h[3] - fmla v24.8h, v2.8h, v0.h[4] - fmla v25.8h, v2.8h, v0.h[5] - fmla v26.8h, v2.8h, v0.h[6] - fmla v27.8h, v2.8h, v0.h[7] - + FMLA_8 v8, v9, v10, v11, v12, v13, v14, v15, v1, v0 + FMLA_8 v20, v21, v22, v23, v24, v25, v26, v27, v2, v0 ld1 {v0.4h}, [x15], #8 - fmla v16.8h, v1.8h, v0.h[0] - fmla v17.8h, v1.8h, v0.h[1] - fmla v18.8h, v1.8h, v0.h[2] - fmla v19.8h, v1.8h, v0.h[3] - fmla v28.8h, v2.8h, v0.h[0] - fmla v29.8h, v2.8h, v0.h[1] - fmla v30.8h, v2.8h, v0.h[2] - fmla v31.8h, v2.8h, v0.h[3] - + FMLA_4 v16, v17, v18, v19, v1, v0 + FMLA_4 v28, v29, v30, v31, v2, v0 bne LoopL1 LoopLEnd: add x2, x2, x11 + sub x10, x10, #2 + cmp x10, #2 - cbz x4, StoreLH8 + cbz x4, StoreLH8 // If postParameter* is nullptr, not the last blockId, just store the intemediate results. 
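The int8 variant appears to follow the same block-accumulation contract as the int4 kernel sketched earlier (blockId in `parameter[6]` selecting fmul-initialize versus fmla-accumulate, post-treatment deferred until `postParameters` is non-null); only the weight decode differs, so the driver sketch above would apply with `MNNPackedMatMulFP16_int8` substituted. The enlarged 96-byte frame here preserves x19–x22, which now carry the blockId and the C-tile reload pointer.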
- AddBiasLH8: - ld1 {v5.8h}, [x4] + ld1 {v5.4s}, [x4] fcvtn v5.4h, v5.4s dup v6.8h, v5.h[2] // Min Value dup v7.8h, v5.h[3] // Max Value - ld1 {v0.8h, v1.8h}, [x5], #32 - - fmla v8.8h, v0.8h, v5.h[1] - fmla v9.8h, v0.8h, v5.h[1] - fmla v10.8h, v0.8h, v5.h[1] - fmla v11.8h, v0.8h, v5.h[1] - - fmla v12.8h, v0.8h, v5.h[1] - fmla v13.8h, v0.8h, v5.h[1] - fmla v14.8h, v0.8h, v5.h[1] - fmla v15.8h, v0.8h, v5.h[1] - - fmla v16.8h, v0.8h, v5.h[1] - fmla v17.8h, v0.8h, v5.h[1] - fmla v18.8h, v0.8h, v5.h[1] - fmla v19.8h, v0.8h, v5.h[1] - - fmla v20.8h, v1.8h, v5.h[1] - fmla v21.8h, v1.8h, v5.h[1] - fmla v22.8h, v1.8h, v5.h[1] - fmla v23.8h, v1.8h, v5.h[1] - - fmla v24.8h, v1.8h, v5.h[1] - fmla v25.8h, v1.8h, v5.h[1] - fmla v26.8h, v1.8h, v5.h[1] - fmla v27.8h, v1.8h, v5.h[1] - - fmla v28.8h, v1.8h, v5.h[1] - fmla v29.8h, v1.8h, v5.h[1] - fmla v30.8h, v1.8h, v5.h[1] - fmla v31.8h, v1.8h, v5.h[1] + + AddBiasLH8: + cbz x5, PostTreatLH8 + ld1 {v0.8h, v1.8h}, [x5], #32 // gemm bias + FADD_12 v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v0 + FADD_12 v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v1 PostTreatLH8: - fmax v8.8h, v8.8h, v6.8h - fmax v9.8h, v9.8h, v6.8h - fmax v10.8h, v10.8h, v6.8h - fmax v11.8h, v11.8h, v6.8h - fmax v12.8h, v12.8h, v6.8h - fmax v13.8h, v13.8h, v6.8h - fmax v14.8h, v14.8h, v6.8h - fmax v15.8h, v15.8h, v6.8h - fmax v16.8h, v16.8h, v6.8h - fmax v17.8h, v17.8h, v6.8h - fmax v18.8h, v18.8h, v6.8h - fmax v19.8h, v19.8h, v6.8h - fmax v20.8h, v20.8h, v6.8h - fmax v21.8h, v21.8h, v6.8h - fmax v22.8h, v22.8h, v6.8h - fmax v23.8h, v23.8h, v6.8h - fmax v24.8h, v24.8h, v6.8h - fmax v25.8h, v25.8h, v6.8h - fmax v26.8h, v26.8h, v6.8h - fmax v27.8h, v27.8h, v6.8h - fmax v28.8h, v28.8h, v6.8h - fmax v29.8h, v29.8h, v6.8h - fmax v30.8h, v30.8h, v6.8h - fmax v31.8h, v31.8h, v6.8h - - fmin v8.8h, v8.8h, v7.8h - fmin v9.8h, v9.8h, v7.8h - fmin v10.8h, v10.8h, v7.8h - fmin v11.8h, v11.8h, v7.8h - fmin v12.8h, v12.8h, v7.8h - fmin v13.8h, v13.8h, v7.8h - fmin v14.8h, v14.8h, v7.8h - fmin v15.8h, v15.8h, v7.8h - fmin v16.8h, v16.8h, v7.8h - fmin v17.8h, v17.8h, v7.8h - fmin v18.8h, v18.8h, v7.8h - fmin v19.8h, v19.8h, v7.8h - fmin v20.8h, v20.8h, v7.8h - fmin v21.8h, v21.8h, v7.8h - fmin v22.8h, v22.8h, v7.8h - fmin v23.8h, v23.8h, v7.8h - fmin v24.8h, v24.8h, v7.8h - fmin v25.8h, v25.8h, v7.8h - fmin v26.8h, v26.8h, v7.8h - fmin v27.8h, v27.8h, v7.8h - fmin v28.8h, v28.8h, v7.8h - fmin v29.8h, v29.8h, v7.8h - fmin v30.8h, v30.8h, v7.8h - fmin v31.8h, v31.8h, v7.8h + FMAX_12 v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v6 + FMAX_12 v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v6 + FMIN_12 v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v7 + FMIN_12 v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v7 StoreLH8: @@ -228,12 +218,9 @@ LoopH: st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x0], #64 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x14 - sub x10, x10, #2 - st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], #64 st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], x14 - cmp x10, #2 bge LoopH @@ -252,6 +239,15 @@ LoopHRemain: fmla v3.8h, v6.8h, v20.8h ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 + cbnz x19, LH4_BLOCK_GT_0 + + LH4_BLOCK0: + + FMUL_4 v8, v9, v10, v11, v3, v0 + FMUL_4 v12, v13, v14, v15, v3, v1 + FMUL_4 v16, v17, v18, v19, v3, v2 + b LH4_INIT_END + /* fmul v8.8h, v3.8h, v0.h[0] fmul v9.8h, v3.8h, v0.h[1] fmul v10.8h, v3.8h, v0.h[2] @@ -264,7 +260,17 @@ LoopHRemain: fmul v17.8h, v3.8h, v2.h[1] 
fmul v18.8h, v3.8h, v2.h[2] fmul v19.8h, v3.8h, v2.h[3] + */ + + LH4_BLOCK_GT_0: + ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x20], #64 + ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x20], #64 + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x20] + FMLA_4 v8, v9, v10, v11, v3, v0 + FMLA_4 v12, v13, v14, v15, v3, v1 + FMLA_4 v16, v17, v18, v19, v3, v2 + LH4_INIT_END: beq LoopLREnd LoopLR: @@ -276,71 +282,26 @@ LoopHRemain: fmla v3.8h, v6.8h, v20.8h ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 - fmla v8.8h, v3.8h, v0.h[0] - fmla v9.8h, v3.8h, v0.h[1] - fmla v10.8h, v3.8h, v0.h[2] - fmla v11.8h, v3.8h, v0.h[3] - fmla v12.8h, v3.8h, v1.h[0] - fmla v13.8h, v3.8h, v1.h[1] - fmla v14.8h, v3.8h, v1.h[2] - fmla v15.8h, v3.8h, v1.h[3] - fmla v16.8h, v3.8h, v2.h[0] - fmla v17.8h, v3.8h, v2.h[1] - fmla v18.8h, v3.8h, v2.h[2] - fmla v19.8h, v3.8h, v2.h[3] - + FMLA_4 v8, v9, v10, v11, v3, v0 + FMLA_4 v12, v13, v14, v15, v3, v1 + FMLA_4 v16, v17, v18, v19, v3, v2 bne LoopLR LoopLREnd: cbz x4, StoreLH4 - AddBiasLH4: - ld1 {v5.8h}, [x4] + + ld1 {v5.4s}, [x4] fcvtn v5.4h, v5.4s dup v6.8h, v5.h[2] // Min Value dup v7.8h, v5.h[3] // Max Value + AddBiasLH4: + cbz x5, PostTreatLH4 ld1 {v0.8h}, [x5], #16 - - fmla v8.8h, v0.8h, v5.h[1] - fmla v9.8h, v0.8h, v5.h[1] - fmla v10.8h, v0.8h, v5.h[1] - fmla v11.8h, v0.8h, v5.h[1] - - fmla v12.8h, v0.8h, v5.h[1] - fmla v13.8h, v0.8h, v5.h[1] - fmla v14.8h, v0.8h, v5.h[1] - fmla v15.8h, v0.8h, v5.h[1] - - fmla v16.8h, v0.8h, v5.h[1] - fmla v17.8h, v0.8h, v5.h[1] - fmla v18.8h, v0.8h, v5.h[1] - fmla v19.8h, v0.8h, v5.h[1] + FADD_12 v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v0 PostTreatLH4: - fmax v8.8h, v8.8h, v6.8h - fmax v9.8h, v9.8h, v6.8h - fmax v10.8h, v10.8h, v6.8h - fmax v11.8h, v11.8h, v6.8h - fmax v12.8h, v12.8h, v6.8h - fmax v13.8h, v13.8h, v6.8h - fmax v14.8h, v14.8h, v6.8h - fmax v15.8h, v15.8h, v6.8h - fmax v16.8h, v16.8h, v6.8h - fmax v17.8h, v17.8h, v6.8h - fmax v18.8h, v18.8h, v6.8h - fmax v19.8h, v19.8h, v6.8h - - fmin v8.8h, v8.8h, v7.8h - fmin v9.8h, v9.8h, v7.8h - fmin v10.8h, v10.8h, v7.8h - fmin v11.8h, v11.8h, v7.8h - fmin v12.8h, v12.8h, v7.8h - fmin v13.8h, v13.8h, v7.8h - fmin v14.8h, v14.8h, v7.8h - fmin v15.8h, v15.8h, v7.8h - fmin v16.8h, v16.8h, v7.8h - fmin v17.8h, v17.8h, v7.8h - fmin v18.8h, v18.8h, v7.8h - fmin v19.8h, v19.8h, v7.8h + FMAX_12 v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v6 + FMIN_12 v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v7 StoreLH4: @@ -350,10 +311,12 @@ LoopHRemain: End: +ldp x21, x22, [sp, #80] +ldp x19, x20, [sp, #64] ldp d8, d9, [sp, #48] ldp d10, d11, [sp, #32] ldp d12, d13, [sp, #16] -ldp d14, d15, [sp], #64 +ldp d14, d15, [sp], #96 ret diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int4.S b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int4.S index 0c93af680..3949f7414 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int4.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int4.S @@ -39,10 +39,10 @@ ldr x10, [x4, #16] // h ldr x7, [x4, #24] // cStride ldr x19, [x4, #40] // bExtraStride +ldr x26, [x4, #48] // blockId add x10, x10, #7 lsr x10, x10, #3 - cbz x5, Start ld1 {v5.4s}, [x5] fcvtn v5.4h, v5.4s @@ -63,6 +63,7 @@ LoopE8: mov x13, x2 mov x14, x22 mov x25, x23 + LH8: cmp x8, #2 @@ -90,6 +91,8 @@ LoopE8: fmla v4.8h, v2.8h, v13.8h ld1 {v0.8h}, [x15], x11 + cbnz x26, LE8H8_BLOCK_GT_0 + fmul v16.8h, v3.8h, v0.h[0] fmul v17.8h, v3.8h, v0.h[1] fmul v18.8h, v3.8h, v0.h[2] @@ -109,6 
+112,38 @@ LoopE8: fmul v29.8h, v4.8h, v0.h[5] fmul v30.8h, v4.8h, v0.h[6] fmul v31.8h, v4.8h, v0.h[7] + b LE8H8_INIT_END + + LE8H8_BLOCK_GT_0: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 + ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x24 + + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0] + sub x0, x0, #128 + sub x0, x0, x24 + + fmla v16.8h, v3.8h, v0.h[0] + fmla v17.8h, v3.8h, v0.h[1] + fmla v18.8h, v3.8h, v0.h[2] + fmla v19.8h, v3.8h, v0.h[3] + + fmla v20.8h, v4.8h, v0.h[0] + fmla v21.8h, v4.8h, v0.h[1] + fmla v22.8h, v4.8h, v0.h[2] + fmla v23.8h, v4.8h, v0.h[3] + + fmla v24.8h, v3.8h, v0.h[4] + fmla v25.8h, v3.8h, v0.h[5] + fmla v26.8h, v3.8h, v0.h[6] + fmla v27.8h, v3.8h, v0.h[7] + + fmla v28.8h, v4.8h, v0.h[4] + fmla v29.8h, v4.8h, v0.h[5] + fmla v30.8h, v4.8h, v0.h[6] + fmla v31.8h, v4.8h, v0.h[7] + + LE8H8_INIT_END: beq LoopLEnd LoopL: @@ -156,30 +191,30 @@ LoopE8: add x13, x13, x19 sub x8, x8, #2 - cbz x5, StoreLH8 AddBiasLH8: + cbz x20, PostTreatLH8 ld1 {v0.8h, v1.8h}, [x20], #32 - fmla v16.8h, v0.8h, v5.h[1] - fmla v17.8h, v0.8h, v5.h[1] - fmla v18.8h, v0.8h, v5.h[1] - fmla v19.8h, v0.8h, v5.h[1] + fadd v16.8h, v0.8h, v16.8h + fadd v17.8h, v0.8h, v17.8h + fadd v18.8h, v0.8h, v18.8h + fadd v19.8h, v0.8h, v19.8h - fmla v20.8h, v1.8h, v5.h[1] - fmla v21.8h, v1.8h, v5.h[1] - fmla v22.8h, v1.8h, v5.h[1] - fmla v23.8h, v1.8h, v5.h[1] + fadd v20.8h, v1.8h, v20.8h + fadd v21.8h, v1.8h, v21.8h + fadd v22.8h, v1.8h, v22.8h + fadd v23.8h, v1.8h, v23.8h - fmla v24.8h, v0.8h, v5.h[1] - fmla v25.8h, v0.8h, v5.h[1] - fmla v26.8h, v0.8h, v5.h[1] - fmla v27.8h, v0.8h, v5.h[1] + fadd v24.8h, v0.8h, v24.8h + fadd v25.8h, v0.8h, v25.8h + fadd v26.8h, v0.8h, v26.8h + fadd v27.8h, v0.8h, v27.8h - fmla v28.8h, v1.8h, v5.h[1] - fmla v29.8h, v1.8h, v5.h[1] - fmla v30.8h, v1.8h, v5.h[1] - fmla v31.8h, v1.8h, v5.h[1] + fadd v28.8h, v1.8h, v28.8h + fadd v29.8h, v1.8h, v29.8h + fadd v30.8h, v1.8h, v30.8h + fadd v31.8h, v1.8h, v31.8h PostTreatLH8: fmax v16.8h, v16.8h, v6.8h @@ -245,6 +280,8 @@ LoopE8: fmla v3.8h, v1.8h, v12.8h ld1 {v0.8h}, [x15], x11 + cbnz x26, LE8H4_BLOCK_GT_0 + fmul v16.8h, v3.8h, v0.h[0] fmul v17.8h, v3.8h, v0.h[1] fmul v18.8h, v3.8h, v0.h[2] @@ -253,6 +290,22 @@ LoopE8: fmul v21.8h, v3.8h, v0.h[5] fmul v22.8h, v3.8h, v0.h[6] fmul v23.8h, v3.8h, v0.h[7] + b LE8H4_INIT_END + + LE8H4_BLOCK_GT_0: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0] + fmla v16.8h, v3.8h, v0.h[0] + fmla v17.8h, v3.8h, v0.h[1] + fmla v18.8h, v3.8h, v0.h[2] + fmla v19.8h, v3.8h, v0.h[3] + sub x0, x0, #64 + + fmla v20.8h, v3.8h, v0.h[4] + fmla v21.8h, v3.8h, v0.h[5] + fmla v22.8h, v3.8h, v0.h[6] + fmla v23.8h, v3.8h, v0.h[7] + LE8H4_INIT_END: beq LoopLREnd LoopLR: @@ -283,17 +336,18 @@ LoopE8: cbz x5, StoreLH8x4 AddBiasLH8x4: + cbz x20, PostTreatLH8x4 ld1 {v0.8h}, [x20] - fmla v16.8h, v0.8h, v5.h[1] - fmla v17.8h, v0.8h, v5.h[1] - fmla v18.8h, v0.8h, v5.h[1] - fmla v19.8h, v0.8h, v5.h[1] + fadd v16.8h, v16.8h, v0.8h + fadd v17.8h, v17.8h, v0.8h + fadd v18.8h, v18.8h, v0.8h + fadd v19.8h, v19.8h, v0.8h - fmla v20.8h, v0.8h, v5.h[1] - fmla v21.8h, v0.8h, v5.h[1] - fmla v22.8h, v0.8h, v5.h[1] - fmla v23.8h, v0.8h, v5.h[1] + fadd v20.8h, v20.8h, v0.8h + fadd v21.8h, v21.8h, v0.8h + fadd v22.8h, v22.8h, v0.8h + fadd v23.8h, v23.8h, v0.8h PostTreatLH8x4: fmax v16.8h, v16.8h, v6.8h @@ -362,6 +416,8 @@ blt E1 fmla v4.8h, v2.8h, v13.8h ld1 {v0.4h}, [x15], x11 + cbnz x26, LE4H8_BLOCK_GT_0 + fmul v16.8h, v3.8h, v0.h[0] fmul v17.8h, 
v3.8h, v0.h[1] fmul v18.8h, v3.8h, v0.h[2] @@ -371,7 +427,23 @@ blt E1 fmul v21.8h, v4.8h, v0.h[1] fmul v22.8h, v4.8h, v0.h[2] fmul v23.8h, v4.8h, v0.h[3] + b LE4H8_INIT_END + LE4H8_BLOCK_GT_0: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x7 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0] + fmla v16.8h, v3.8h, v0.h[0] + fmla v17.8h, v3.8h, v0.h[1] + fmla v18.8h, v3.8h, v0.h[2] + fmla v19.8h, v3.8h, v0.h[3] + sub x0, x0, x7 + + fmla v20.8h, v4.8h, v0.h[0] + fmla v21.8h, v4.8h, v0.h[1] + fmla v22.8h, v4.8h, v0.h[2] + fmla v23.8h, v4.8h, v0.h[3] + + LE4H8_INIT_END: beq E4LoopLEnd E4LoopL: @@ -415,17 +487,18 @@ blt E1 cbz x5, StoreLH4x8 AddBiasLH4x8: + cbz x20, PostTreatLH4x8 ld1 {v0.8h, v1.8h}, [x20], #32 - fmla v16.8h, v0.8h, v5.h[1] - fmla v17.8h, v0.8h, v5.h[1] - fmla v18.8h, v0.8h, v5.h[1] - fmla v19.8h, v0.8h, v5.h[1] + fadd v16.8h, v0.8h, v16.8h + fadd v17.8h, v0.8h, v17.8h + fadd v18.8h, v0.8h, v18.8h + fadd v19.8h, v0.8h, v19.8h - fmla v20.8h, v1.8h, v5.h[1] - fmla v21.8h, v1.8h, v5.h[1] - fmla v22.8h, v1.8h, v5.h[1] - fmla v23.8h, v1.8h, v5.h[1] + fadd v20.8h, v1.8h, v20.8h + fadd v21.8h, v1.8h, v21.8h + fadd v22.8h, v1.8h, v22.8h + fadd v23.8h, v1.8h, v23.8h PostTreatLH4x8: fmax v16.8h, v16.8h, v6.8h @@ -472,11 +545,21 @@ blt E1 fmla v3.8h, v1.8h, v12.8h ld1 {v0.4h}, [x15], x11 + cbnz x26, LE4H4_BLOCK_GT_0 fmul v16.8h, v3.8h, v0.h[0] fmul v17.8h, v3.8h, v0.h[1] fmul v18.8h, v3.8h, v0.h[2] fmul v19.8h, v3.8h, v0.h[3] + b LE4H4_INIT_END + + LE4H4_BLOCK_GT_0: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0] + fmla v16.8h, v3.8h, v0.h[0] + fmla v17.8h, v3.8h, v0.h[1] + fmla v18.8h, v3.8h, v0.h[2] + fmla v19.8h, v3.8h, v0.h[3] + LE4H4_INIT_END: beq E4LoopLREnd E4LoopLR: @@ -502,12 +585,13 @@ blt E1 cbz x5, StoreLH4x4 AddBiasLH4x4: + cbz x20, PostTreatLH4x4 ld1 {v0.8h}, [x20] - fmla v16.8h, v0.8h, v5.h[1] - fmla v17.8h, v0.8h, v5.h[1] - fmla v18.8h, v0.8h, v5.h[1] - fmla v19.8h, v0.8h, v5.h[1] + fadd v16.8h, v16.8h, v0.8h + fadd v17.8h, v17.8h, v0.8h + fadd v18.8h, v18.8h, v0.8h + fadd v19.8h, v19.8h, v0.8h PostTreatLH4x4: @@ -569,9 +653,19 @@ LoopE1: fmla v4.8h, v2.8h, v13.8h ld1 {v0.h}[0], [x15], x11 + cbnz x26, LE1H8_BLOCK_GT_0 fmul v16.8h, v3.8h, v0.h[0] fmul v20.8h, v4.8h, v0.h[0] + b LE1H8_INIT_END + + LE1H8_BLOCK_GT_0: + ld1 {v16.8h}, [x0], x7 + ld1 {v20.8h}, [x0] + sub x0, x0, x7 + fmla v16.8h, v3.8h, v0.h[0] + fmla v20.8h, v4.8h, v0.h[0] + LE1H8_INIT_END: beq E1LoopLEnd E1LoopL: @@ -606,10 +700,11 @@ LoopE1: cbz x5, StoreLH1x8 AddBiasLH1x8: + cbz x20, PostTreatLH1x8 ld1 {v0.8h, v1.8h}, [x20], #32 - fmla v16.8h, v0.8h, v5.h[1] - fmla v20.8h, v1.8h, v5.h[1] + fadd v16.8h, v0.8h, v16.8h + fadd v20.8h, v1.8h, v20.8h PostTreatLH1x8: fmax v16.8h, v16.8h, v6.8h @@ -641,10 +736,15 @@ LoopE1: scvtf v1.8h, v1.8h mov v3.8h, v14.8h fmla v3.8h, v1.8h, v12.8h - ld1 {v0.h}[0], [x15], x11 + cbnz x26, LE1H4_BLOCK_GT_0 fmul v16.8h, v3.8h, v0.h[0] + b LE1H4_INIT_END + LE1H4_BLOCK_GT_0: + ld1 {v16.8h}, [x0] + fmla v16.8h, v3.8h, v0.h[0] + LE1H4_INIT_END: beq E1LoopLREnd E1LoopLR: @@ -667,6 +767,7 @@ LoopE1: cbz x5, StoreLH1x4 AddBiasLH1x4: + cbz x20, PostTreatLH1x4 ld1 {v0.8h}, [x20] fmla v16.8h, v0.8h, v5.h[1] diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int8.S b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int8.S index 0760af42b..f73046ec0 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int8.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int8.S @@ -36,10 +36,10 @@ ldr x10, [x4, #16] 
// h ldr x7, [x4, #24] // cStride ldr x19, [x4, #40] // bExtraStride +ldr x26, [x4, #48] // blockId add x10, x10, #7 lsr x10, x10, #3 - cbz x5, Start ld1 {v5.4s}, [x5] fcvtn v5.4h, v5.4s @@ -81,6 +81,8 @@ LoopE8: fmla v4.8h, v2.8h, v13.8h ld1 {v0.8h}, [x15], x11 + cbnz x26, LE8H8_BLOCK_GT_0 + fmul v16.8h, v3.8h, v0.h[0] fmul v17.8h, v3.8h, v0.h[1] fmul v18.8h, v3.8h, v0.h[2] @@ -100,6 +102,35 @@ LoopE8: fmul v29.8h, v4.8h, v0.h[5] fmul v30.8h, v4.8h, v0.h[6] fmul v31.8h, v4.8h, v0.h[7] + b LE8H8_INIT_END + LE8H8_BLOCK_GT_0: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 + ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x24 + + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0] + fmla v16.8h, v3.8h, v0.h[0] + fmla v17.8h, v3.8h, v0.h[1] + fmla v18.8h, v3.8h, v0.h[2] + fmla v19.8h, v3.8h, v0.h[3] + sub x0, x0, #128 + sub x0, x0, x24 + + fmla v20.8h, v4.8h, v0.h[0] + fmla v21.8h, v4.8h, v0.h[1] + fmla v22.8h, v4.8h, v0.h[2] + fmla v23.8h, v4.8h, v0.h[3] + + fmla v24.8h, v3.8h, v0.h[4] + fmla v25.8h, v3.8h, v0.h[5] + fmla v26.8h, v3.8h, v0.h[6] + fmla v27.8h, v3.8h, v0.h[7] + + fmla v28.8h, v4.8h, v0.h[4] + fmla v29.8h, v4.8h, v0.h[5] + fmla v30.8h, v4.8h, v0.h[6] + fmla v31.8h, v4.8h, v0.h[7] + LE8H8_INIT_END: beq LoopLEnd LoopL: @@ -141,30 +172,30 @@ LoopE8: add x13, x13, x19 sub x8, x8, #2 - cbz x5, StoreLH8 AddBiasLH8: + cbz x20, PostTreatLH8 ld1 {v0.8h, v1.8h}, [x20], #32 - fmla v16.8h, v0.8h, v5.h[1] - fmla v17.8h, v0.8h, v5.h[1] - fmla v18.8h, v0.8h, v5.h[1] - fmla v19.8h, v0.8h, v5.h[1] + fadd v16.8h, v0.8h, v16.8h + fadd v17.8h, v0.8h, v17.8h + fadd v18.8h, v0.8h, v18.8h + fadd v19.8h, v0.8h, v19.8h - fmla v20.8h, v1.8h, v5.h[1] - fmla v21.8h, v1.8h, v5.h[1] - fmla v22.8h, v1.8h, v5.h[1] - fmla v23.8h, v1.8h, v5.h[1] + fadd v20.8h, v1.8h, v20.8h + fadd v21.8h, v1.8h, v21.8h + fadd v22.8h, v1.8h, v22.8h + fadd v23.8h, v1.8h, v23.8h - fmla v24.8h, v0.8h, v5.h[1] - fmla v25.8h, v0.8h, v5.h[1] - fmla v26.8h, v0.8h, v5.h[1] - fmla v27.8h, v0.8h, v5.h[1] + fadd v24.8h, v0.8h, v24.8h + fadd v25.8h, v0.8h, v25.8h + fadd v26.8h, v0.8h, v26.8h + fadd v27.8h, v0.8h, v27.8h - fmla v28.8h, v1.8h, v5.h[1] - fmla v29.8h, v1.8h, v5.h[1] - fmla v30.8h, v1.8h, v5.h[1] - fmla v31.8h, v1.8h, v5.h[1] + fadd v28.8h, v1.8h, v28.8h + fadd v29.8h, v1.8h, v29.8h + fadd v30.8h, v1.8h, v30.8h + fadd v31.8h, v1.8h, v31.8h PostTreatLH8: fmax v16.8h, v16.8h, v6.8h @@ -226,6 +257,8 @@ LoopE8: fmla v3.8h, v1.8h, v12.8h ld1 {v0.8h}, [x15], x11 + cbnz x26, LE8H4_BLOCK_GT_0 + fmul v16.8h, v3.8h, v0.h[0] fmul v17.8h, v3.8h, v0.h[1] fmul v18.8h, v3.8h, v0.h[2] @@ -234,6 +267,22 @@ LoopE8: fmul v21.8h, v3.8h, v0.h[5] fmul v22.8h, v3.8h, v0.h[6] fmul v23.8h, v3.8h, v0.h[7] + b LE8H4_INIT_END + + LE8H4_BLOCK_GT_0: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0] + fmla v16.8h, v3.8h, v0.h[0] + fmla v17.8h, v3.8h, v0.h[1] + fmla v18.8h, v3.8h, v0.h[2] + fmla v19.8h, v3.8h, v0.h[3] + sub x0, x0, #64 + + fmla v20.8h, v3.8h, v0.h[4] + fmla v21.8h, v3.8h, v0.h[5] + fmla v22.8h, v3.8h, v0.h[6] + fmla v23.8h, v3.8h, v0.h[7] + LE8H4_INIT_END: beq LoopLREnd LoopLR: @@ -260,17 +309,18 @@ LoopE8: cbz x5, StoreLH8x4 AddBiasLH8x4: + cbz x20, PostTreatLH8x4 ld1 {v0.8h}, [x20] - fmla v16.8h, v0.8h, v5.h[1] - fmla v17.8h, v0.8h, v5.h[1] - fmla v18.8h, v0.8h, v5.h[1] - fmla v19.8h, v0.8h, v5.h[1] + fadd v16.8h, v16.8h, v0.8h + fadd v17.8h, v17.8h, v0.8h + fadd v18.8h, v18.8h, v0.8h + fadd v19.8h, v19.8h, v0.8h - fmla v20.8h, v0.8h, v5.h[1] - fmla v21.8h, 
v0.8h, v5.h[1] - fmla v22.8h, v0.8h, v5.h[1] - fmla v23.8h, v0.8h, v5.h[1] + fadd v20.8h, v20.8h, v0.8h + fadd v21.8h, v21.8h, v0.8h + fadd v22.8h, v22.8h, v0.8h + fadd v23.8h, v23.8h, v0.8h PostTreatLH8x4: fmax v16.8h, v16.8h, v6.8h @@ -333,6 +383,8 @@ blt E1 fmla v4.8h, v2.8h, v13.8h ld1 {v0.4h}, [x15], x11 + cbnz x26, LE4H8_BLOCK_GT_0 + fmul v16.8h, v3.8h, v0.h[0] fmul v17.8h, v3.8h, v0.h[1] fmul v18.8h, v3.8h, v0.h[2] @@ -342,7 +394,23 @@ blt E1 fmul v21.8h, v4.8h, v0.h[1] fmul v22.8h, v4.8h, v0.h[2] fmul v23.8h, v4.8h, v0.h[3] + b LE4H8_INIT_END + LE4H8_BLOCK_GT_0: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x7 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0] + fmla v16.8h, v3.8h, v0.h[0] + fmla v17.8h, v3.8h, v0.h[1] + fmla v18.8h, v3.8h, v0.h[2] + fmla v19.8h, v3.8h, v0.h[3] + sub x0, x0, x7 + + fmla v20.8h, v4.8h, v0.h[0] + fmla v21.8h, v4.8h, v0.h[1] + fmla v22.8h, v4.8h, v0.h[2] + fmla v23.8h, v4.8h, v0.h[3] + + LE4H8_INIT_END: beq E4LoopLEnd E4LoopL: @@ -379,17 +447,18 @@ blt E1 cbz x5, StoreLH4x8 AddBiasLH4x8: + cbz x20, PostTreatLH4x8 ld1 {v0.8h, v1.8h}, [x20], #32 - fmla v16.8h, v0.8h, v5.h[1] - fmla v17.8h, v0.8h, v5.h[1] - fmla v18.8h, v0.8h, v5.h[1] - fmla v19.8h, v0.8h, v5.h[1] + fadd v16.8h, v0.8h, v16.8h + fadd v17.8h, v0.8h, v17.8h + fadd v18.8h, v0.8h, v18.8h + fadd v19.8h, v0.8h, v19.8h - fmla v20.8h, v1.8h, v5.h[1] - fmla v21.8h, v1.8h, v5.h[1] - fmla v22.8h, v1.8h, v5.h[1] - fmla v23.8h, v1.8h, v5.h[1] + fadd v20.8h, v1.8h, v20.8h + fadd v21.8h, v1.8h, v21.8h + fadd v22.8h, v1.8h, v22.8h + fadd v23.8h, v1.8h, v23.8h PostTreatLH4x8: fmax v16.8h, v16.8h, v6.8h @@ -432,11 +501,21 @@ blt E1 fmla v3.8h, v1.8h, v12.8h ld1 {v0.4h}, [x15], x11 + cbnz x26, LE4H4_BLOCK_GT_0 fmul v16.8h, v3.8h, v0.h[0] fmul v17.8h, v3.8h, v0.h[1] fmul v18.8h, v3.8h, v0.h[2] fmul v19.8h, v3.8h, v0.h[3] + b LE4H4_INIT_END + + LE4H4_BLOCK_GT_0: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0] + fmla v16.8h, v3.8h, v0.h[0] + fmla v17.8h, v3.8h, v0.h[1] + fmla v18.8h, v3.8h, v0.h[2] + fmla v19.8h, v3.8h, v0.h[3] + LE4H4_INIT_END: beq E4LoopLREnd E4LoopLR: @@ -458,12 +537,13 @@ blt E1 cbz x5, StoreLH4x4 AddBiasLH4x4: + cbz x20, PostTreatLH4x4 ld1 {v0.8h}, [x20] - fmla v16.8h, v0.8h, v5.h[1] - fmla v17.8h, v0.8h, v5.h[1] - fmla v18.8h, v0.8h, v5.h[1] - fmla v19.8h, v0.8h, v5.h[1] + fadd v16.8h, v16.8h, v0.8h + fadd v17.8h, v17.8h, v0.8h + fadd v18.8h, v18.8h, v0.8h + fadd v19.8h, v19.8h, v0.8h PostTreatLH4x4: @@ -519,9 +599,19 @@ LoopE1: fmla v4.8h, v2.8h, v13.8h ld1 {v0.h}[0], [x15], x11 + cbnz x26, LE1H8_BLOCK_GT_0 fmul v16.8h, v3.8h, v0.h[0] fmul v20.8h, v4.8h, v0.h[0] + b LE1H8_INIT_END + + LE1H8_BLOCK_GT_0: + ld1 {v16.8h}, [x0], x7 + ld1 {v20.8h}, [x0] + sub x0, x0, x7 + fmla v16.8h, v3.8h, v0.h[0] + fmla v20.8h, v4.8h, v0.h[0] + LE1H8_INIT_END: beq E1LoopLEnd E1LoopL: @@ -550,10 +640,11 @@ LoopE1: cbz x5, StoreLH1x8 AddBiasLH1x8: + cbz x20, PostTreatLH1x8 ld1 {v0.8h, v1.8h}, [x20], #32 - fmla v16.8h, v0.8h, v5.h[1] - fmla v20.8h, v1.8h, v5.h[1] + fadd v16.8h, v0.8h, v16.8h + fadd v20.8h, v1.8h, v20.8h PostTreatLH1x8: fmax v16.8h, v16.8h, v6.8h @@ -582,8 +673,14 @@ LoopE1: fmla v3.8h, v1.8h, v12.8h ld1 {v0.h}[0], [x15], x11 + cbnz x26, LE1H4_BLOCK_GT_0 fmul v16.8h, v3.8h, v0.h[0] + b LE1H4_INIT_END + LE1H4_BLOCK_GT_0: + ld1 {v16.8h}, [x0] + fmla v16.8h, v3.8h, v0.h[0] + LE1H4_INIT_END: beq E1LoopLREnd E1LoopLR: @@ -601,6 +698,7 @@ LoopE1: cbz x5, StoreLH1x4 AddBiasLH1x4: + cbz x20, PostTreatLH1x4 ld1 {v0.8h}, [x20] fmla v16.8h, v0.8h, v5.h[1] diff --git a/source/backend/cpu/CMakeLists.txt 
b/source/backend/cpu/CMakeLists.txt index 5c91ad5b6..22aeb1ef4 100644 --- a/source/backend/cpu/CMakeLists.txt +++ b/source/backend/cpu/CMakeLists.txt @@ -1,6 +1,4 @@ # CPU -option(MNN_SUPPORT_BF16 "Enable MNN's bf16 op" OFF) -option(MNN_LOW_MEMORY "Build MNN support low memory for weight quant model." OFF) if(MNN_SUPPORT_RENDER) FILE(GLOB MNN_CPU_SRC ${CMAKE_CURRENT_LIST_DIR}/* ${CMAKE_CURRENT_LIST_DIR}/compute/* ${CMAKE_CURRENT_LIST_DIR}/render/*) diff --git a/source/backend/cpu/CPUAttention.cpp b/source/backend/cpu/CPUAttention.cpp index 4d7b0b894..c37d9a3f7 100644 --- a/source/backend/cpu/CPUAttention.cpp +++ b/source/backend/cpu/CPUAttention.cpp @@ -29,8 +29,8 @@ namespace MNN { template static void prefill_pack(Tensor* query, Tensor* key, Tensor* value, char* query_ptr, char* key_ptr, char* value_ptr, - int mMaxLength, int mNumHead, int mHeadDim, int mValueH, - int eP, int hP, int query_e, int key_h, int seq_len, int h) { + int mMaxLength, int mNumHead, int mKvNumHead, int mHeadDim, int mValueH, + int eP, int hP, int query_e, int key_h, int seq_len, int h, int kv_h) { auto query_src = query->host(); auto key_src = key->host(); auto value_src = value->host(); @@ -54,7 +54,7 @@ static void prefill_pack(Tensor* query, Tensor* key, Tensor* value, char* query_ for (int k = 0; k < hP; k++) { int s = i * hP + k; if (s < seq_len) { - key_dst[i * mHeadDim * hP + j * hP + k] = key_src[s * mNumHead * mHeadDim + h * mHeadDim + j]; + key_dst[i * mHeadDim * hP + j * hP + k] = key_src[s * mKvNumHead * mHeadDim + kv_h * mHeadDim + j]; } } } @@ -65,7 +65,7 @@ static void prefill_pack(Tensor* query, Tensor* key, Tensor* value, char* query_ for (int k = 0; k < hP; k++) { int hd = i * hP + k; if (hd < mHeadDim) { - value_dst[i * mMaxLength * hP + j * hP + k] = value_src[j * mNumHead * mHeadDim + h * mHeadDim + hd]; + value_dst[i * mMaxLength * hP + j * hP + k] = value_src[j * mKvNumHead * mHeadDim + kv_h * mHeadDim + hd]; } } } @@ -74,7 +74,7 @@ static void prefill_pack(Tensor* query, Tensor* key, Tensor* value, char* query_ template static void decode_pack(Tensor* query, Tensor* key, Tensor* value, char* query_ptr, char* key_ptr, char* value_ptr, - int mMaxLength, int mPastLength, int mHeadDim, int mValueH, int eP, int hP, int h) { + int mMaxLength, int mPastLength, int mHeadDim, int mValueH, int eP, int hP, int h, int kv_h) { auto query_src = query->host(); auto key_src = key->host(); auto value_src = value->host(); @@ -88,12 +88,12 @@ static void decode_pack(Tensor* query, Tensor* key, Tensor* value, char* query_p int outside_offset = UP_DIV(mPastLength, hP); int inside_offset = mPastLength % hP; for (int i = 0; i < mHeadDim; i++) { - key_dst[(outside_offset - (inside_offset != 0)) * mHeadDim * hP + i * hP + inside_offset] = key_src[h * mHeadDim + i]; + key_dst[(outside_offset - (inside_offset != 0)) * mHeadDim * hP + i * hP + inside_offset] = key_src[kv_h * mHeadDim + i]; } // transpose value: [1, num_head, head_dim] -> numhead, [head_dim/hP, kv_seq_len, hP] for (int i = 0; i < mValueH; i++) { for (int j = 0; j < hP; j++) { - value_dst[i * mMaxLength * hP + mPastLength * hP + j] = value_src[h * mHeadDim + i * hP + j]; + value_dst[i * mMaxLength * hP + mPastLength * hP + j] = value_src[kv_h * mHeadDim + i * hP + j]; } } } @@ -163,51 +163,50 @@ static void decode_softmax(float* mask_qk, float* softmax_qk, char* unpack_qk, c } } -void CPUAttentionImpl::allocKVCache() { - if (!mKVCache || mPastLength < mMaxLength) { +void CPUAttention::allocKVCache() { + if (!mKVCache || mResource->mPastLength < 
mResource->mMaxLength) { return; } - mMaxLength = mPastLength + mExpandChunk; + mResource->mMaxLength = mResource->mPastLength + mResource->mExpandChunk; // past_key: [1, numhead, headdim, maxlen] -> numhead, [headdim, maxlen] -> pack_b -> numhead, [maxlen/hP, head_dim, hP] - mPastKey.reset(Tensor::createDevice({mNumHead, UP_DIV(mMaxLength, hP), mHeadDim, hP})); + mResource->mPastKey.reset(Tensor::createDevice({mResource->mKvNumHead, UP_DIV(mResource->mMaxLength, hP), mResource->mHeadDim, hP})); // past_value: [1, numhead, maxlen, headdim] -> numhead, [maxlen, headdim] -> pack_b -> numhead, [head_dim/hP, max_len, hP] - mPastValue.reset(Tensor::createDevice({mNumHead, mValueH, mMaxLength, hP})); - backend()->onAcquireBuffer(mPastKey.get(), Backend::STATIC); - backend()->onAcquireBuffer(mPastValue.get(), Backend::STATIC); + mResource->mPastValue.reset(Tensor::createDevice({mResource->mKvNumHead, mResource->mValueH, mResource->mMaxLength, hP})); + backend()->onAcquireBuffer(mResource->mPastKey.get(), Backend::STATIC); + backend()->onAcquireBuffer(mResource->mPastValue.get(), Backend::STATIC); } -void CPUAttentionImpl::reallocKVCache() { - if (!mKVCache || mPastLength < mMaxLength) { +void CPUAttention::reallocKVCache() { + if (!mKVCache || mResource->mPastLength < mResource->mMaxLength) { return; } - mMaxLength = mPastLength + mExpandChunk; + mResource->mMaxLength = mResource->mPastLength + mResource->mExpandChunk; // past_key: [1, numhead, headdim, maxlen] -> numhead, [headdim, maxlen] -> pack_b -> numhead, [maxlen/hP, head_dim, hP] - auto new_key = Tensor::createDevice({mNumHead, UP_DIV(mMaxLength, hP), mHeadDim, hP}); + auto new_key = Tensor::createDevice({mResource->mKvNumHead, UP_DIV(mResource->mMaxLength, hP), mResource->mHeadDim, hP}); // past_value: [1, numhead, maxlen, headdim] -> numhead, [maxlen, headdim] -> pack_b -> numhead, [head_dim/hP, max_len, hP] - auto new_value = Tensor::createDevice({mNumHead, mValueH, mMaxLength, hP}); + auto new_value = Tensor::createDevice({mResource->mKvNumHead, mResource->mValueH, mResource->mMaxLength, hP}); backend()->onAcquireBuffer(new_key, Backend::STATIC); backend()->onAcquireBuffer(new_value, Backend::STATIC); // copy - for (int h = 0; h < mNumHead; h++) { - ::memset(new_key->host() + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * bytes, 0, UP_DIV(mMaxLength, hP) * mHeadDim * hP * bytes); - ::memset(new_value->host() + h * mValueH * mMaxLength * hP * bytes, 0, mValueH * mMaxLength * hP * bytes); - ::memcpy(new_key->host() + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * bytes, - mPastKey->host() + h * UP_DIV(mPastLength, hP) * mHeadDim * hP * bytes, - UP_DIV(mPastLength, hP) * mHeadDim * hP * bytes); - for (int i = 0; i < mValueH; i++) { - ::memcpy(new_value->host() + (h * mValueH + i) * mMaxLength * hP * bytes, - mPastValue->host() + (h * mValueH + i) * mPastLength * hP * bytes, - mPastLength * hP * bytes); + for (int h = 0; h < mResource->mKvNumHead; h++) { + ::memset(new_key->host() + h * UP_DIV(mResource->mMaxLength, hP) * mResource->mHeadDim * hP * bytes, 0, UP_DIV(mResource->mMaxLength, hP) * mResource->mHeadDim * hP * bytes); + ::memset(new_value->host() + h * mResource->mValueH * mResource->mMaxLength * hP * bytes, 0, mResource->mValueH * mResource->mMaxLength * hP * bytes); + ::memcpy(new_key->host() + h * UP_DIV(mResource->mMaxLength, hP) * mResource->mHeadDim * hP * bytes, + mResource->mPastKey->host() + h * UP_DIV(mResource->mPastLength, hP) * mResource->mHeadDim * hP * bytes, + UP_DIV(mResource->mPastLength, hP) * mResource->mHeadDim 
* hP * bytes); + for (int i = 0; i < mResource->mValueH; i++) { + ::memcpy(new_value->host() + (h * mResource->mValueH + i) * mResource->mMaxLength * hP * bytes, + mResource->mPastValue->host() + (h * mResource->mValueH + i) * mResource->mPastLength * hP * bytes, + mResource->mPastLength * hP * bytes); } } - mPastKey.reset(new_key); - mPastValue.reset(new_value); - mTempQK.reset(Tensor::createDevice({mThreadNum, eP + 2, mMaxLength})); + mResource->mPastKey.reset(new_key); + mResource->mPastValue.reset(new_value); + mTempQK.reset(Tensor::createDevice({mThreadNum, eP + 2, mResource->mMaxLength})); backend()->onAcquireBuffer(mTempQK.get(), Backend::STATIC); } -ErrorCode CPUAttentionImpl::onResize(Backend* _backend, const std::vector& inputs, const std::vector& outputs) { - mBackend = _backend; +ErrorCode CPUAttention::onResize(const std::vector& inputs, const std::vector& outputs) { auto core = static_cast(backend())->functions(); int unit = core->pack; bytes = core->bytes; @@ -221,26 +220,27 @@ ErrorCode CPUAttentionImpl::onResize(Backend* _backend, const std::vectorthreadNumber(); mIsDecode = seq_len == 1; - if (mPastLength == 0 || seq_len > 1) { - mPastLength = seq_len; + if (mResource->mPastLength == 0 || seq_len > 1) { + mResource->mPastLength = seq_len; } - mNumHead = shape[2]; - mHeadDim = shape[3]; - mScale = 1.0 / sqrt(mHeadDim); - mValueH = UP_DIV(mHeadDim, hP); + mResource->mNumHead = shape[2]; + mResource->mKvNumHead = key->shape()[2]; + mResource->mHeadDim = shape[3]; + mResource->mScale = 1.0 / sqrt(mResource->mHeadDim); + mResource->mValueH = UP_DIV(mResource->mHeadDim, hP); int query_e = UP_DIV(seq_len, eP); int key_h = UP_DIV(seq_len, hP); // mPastLength = 10; // alloc kv cache allocKVCache(); - int tileCount = UP_DIV(mNumHead, mThreadNum); + int tileCount = UP_DIV(mResource->mNumHead, mThreadNum); // temp_query - mPackQ.reset(Tensor::createDevice({mThreadNum, query_e, mHeadDim, eP})); - mPackQKV.reset(Tensor::createDevice({mThreadNum, UP_DIV(mHeadDim, unit), seq_len, unit})); + mPackQ.reset(Tensor::createDevice({mThreadNum, query_e, mResource->mHeadDim, eP})); + mPackQKV.reset(Tensor::createDevice({mThreadNum, UP_DIV(mResource->mHeadDim, unit), seq_len, unit})); if (mIsDecode) { - mTempQK.reset(Tensor::createDevice({mThreadNum, eP + 2, mMaxLength})); + mTempQK.reset(Tensor::createDevice({mThreadNum, eP + 2, mResource->mMaxLength})); backend()->onAcquireBuffer(mTempQK.get(), Backend::DYNAMIC); } else { mTempQK.reset(Tensor::createDevice({mThreadNum, 4, seq_len, seq_len})); @@ -254,12 +254,11 @@ ErrorCode CPUAttentionImpl::onResize(Backend* _backend, const std::vector& inputs, const std::vector& outputs) { +ErrorCode CPUAttention::onExecute(const std::vector& inputs, const std::vector& outputs) { auto core = static_cast(backend())->functions(); int unit = core->pack; bytes = core->bytes; core->MNNGetMatMulPackMode(&eP, &lP, &hP); - mBackend = _backend; auto matmulUnit = core->MNNPackedMatMul; auto matmulRemain = core->MNNPackedMatMulRemain; @@ -272,37 +271,40 @@ ErrorCode CPUAttentionImpl::onExecute(Backend* _backend, const std::vectorthreadNumber(); mIsDecode = seq_len == 1; - if (mPastLength == 0 || seq_len > 1) { - mPastLength = seq_len; + if (mResource->mPastLength == 0 || seq_len > 1) { + mResource->mPastLength = seq_len; } - mNumHead = shape[2]; - mHeadDim = shape[3]; - mScale = 1.0 / sqrt(mHeadDim); - mValueH = UP_DIV(mHeadDim, hP); + mResource->mNumHead = shape[2]; + mResource->mKvNumHead = key->shape()[2]; + int group_size = mResource->mNumHead / 
mResource->mKvNumHead; + mResource->mHeadDim = shape[3]; + mResource->mScale = 1.0 / sqrt(mResource->mHeadDim); + mResource->mValueH = UP_DIV(mResource->mHeadDim, hP); int query_e = UP_DIV(seq_len, eP); int key_h = UP_DIV(seq_len, hP); // mPastLength = 10; - int tileCount = UP_DIV(mNumHead, mThreadNum); + int tileCount = UP_DIV(mResource->mNumHead, mThreadNum); // try calloc kv cache mPrefill = [=](int tId){ - auto pack_q = mPackQ->host() + tId * query_e * mHeadDim * eP * bytes; + auto pack_q = mPackQ->host() + tId * query_e * mResource->mHeadDim * eP * bytes; auto pack_qk = mTempQK->host() + tId * 4 * seq_len * seq_len * bytes; auto unpack_qk = pack_qk + seq_len * seq_len * 2 * bytes; auto mask_qk = reinterpret_cast(pack_qk); auto softmax_qk = reinterpret_cast(unpack_qk); - auto pack_qkv = mPackQKV->host() + tId * UP_DIV(mHeadDim, unit) * seq_len * unit * bytes; + auto pack_qkv = mPackQKV->host() + tId * UP_DIV(mResource->mHeadDim, unit) * seq_len * unit * bytes; int head_index = tId * tileCount; - for (int h = head_index; h < head_index + tileCount && h < mNumHead; h++) { + for (int h = head_index; h < head_index + tileCount && h < mResource->mNumHead; h++) { // pack for matmul - auto key_dst = mPastKey->host() + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * bytes; - auto value_dst = mPastValue->host() + h * mValueH * mMaxLength * hP * bytes; + int kv_h = h / group_size; + auto key_dst = mResource->mPastKey->host() + kv_h * UP_DIV(mResource->mMaxLength, hP) * mResource->mHeadDim * hP * bytes; + auto value_dst = mResource->mPastValue->host() + kv_h * mResource->mValueH * mResource->mMaxLength * hP * bytes; if (bytes == 2) { - prefill_pack(query, key, value, pack_q, key_dst, value_dst, mMaxLength, mNumHead, mHeadDim, mValueH, eP, hP, query_e, key_h, seq_len, h); + prefill_pack(query, key, value, pack_q, key_dst, value_dst, mResource->mMaxLength, mResource->mNumHead, mResource->mKvNumHead, mResource->mHeadDim, mResource->mValueH, eP, hP, query_e, key_h, seq_len, h, kv_h); } else { - prefill_pack(query, key, value, pack_q, key_dst, value_dst, mMaxLength, mNumHead, mHeadDim, mValueH, eP, hP, query_e, key_h, seq_len, h); + prefill_pack(query, key, value, pack_q, key_dst, value_dst, mResource->mMaxLength, mResource->mNumHead, mResource->mKvNumHead, mResource->mHeadDim, mResource->mValueH, eP, hP, query_e, key_h, seq_len, h, kv_h); } // query @ key int loop_e = seq_len / eP; @@ -311,32 +313,32 @@ ErrorCode CPUAttentionImpl::onExecute(Backend* _backend, const std::vectormHeadDim; parameters[2] = seq_len; parameters[3] = seq_len * unit * bytes; parameters[4] = 0; parameters[5] = 0; - matmulUnit((float*)(pack_qk + (i * eP * unit) * bytes), (float*)(pack_q + (i * mHeadDim * eP) * bytes), (float*)key_dst, parameters, nullptr, nullptr, nullptr, nullptr); + matmulUnit((float*)(pack_qk + (i * eP * unit) * bytes), (float*)(pack_q + (i * mResource->mHeadDim * eP) * bytes), (float*)key_dst, parameters, nullptr, nullptr, nullptr, nullptr); } { size_t shapeParameters[6]; size_t* parameters = shapeParameters; parameters[0] = eP * bytes; - parameters[1] = mHeadDim; + parameters[1] = mResource->mHeadDim; parameters[2] = seq_len; parameters[3] = seq_len * unit * bytes; parameters[4] = 0; parameters[5] = 0; - matmulRemain((float*)(pack_qk + (loop_e * eP * unit) * bytes), (float*)(pack_q + (loop_e * mHeadDim * eP) * bytes), (float*)key_dst, remain, parameters, nullptr, nullptr, nullptr, nullptr); + matmulRemain((float*)(pack_qk + (loop_e * eP * unit) * bytes), (float*)(pack_q + (loop_e * mResource->mHeadDim * eP) * 
bytes), (float*)key_dst, remain, parameters, nullptr, nullptr, nullptr, nullptr); } int area_offset[1] {seq_len}; core->MNNUnpackCUnitTranspose((float*)unpack_qk, (float*)pack_qk, seq_len, seq_len, area_offset); // div scale and mask auto mask_ptr = mask->host(); if (bytes == 2) { - prefill_softmax(mask_ptr, mask_qk, softmax_qk, unpack_qk, pack_qk, mScale, eP, query_e, seq_len, -65504.0, float_mask); + prefill_softmax(mask_ptr, mask_qk, softmax_qk, unpack_qk, pack_qk, mResource->mScale, eP, query_e, seq_len, -65504.0, float_mask); } else { - prefill_softmax(mask_ptr, mask_qk, softmax_qk, unpack_qk, pack_qk, mScale, eP, query_e, seq_len, std::numeric_limits::lowest(), float_mask); + prefill_softmax(mask_ptr, mask_qk, softmax_qk, unpack_qk, pack_qk, mResource->mScale, eP, query_e, seq_len, std::numeric_limits::lowest(), float_mask); } // qk @ v for (int i = 0 ; i < loop_e; i++) { @@ -344,10 +346,10 @@ ErrorCode CPUAttentionImpl::onExecute(Backend* _backend, const std::vectormHeadDim; parameters[3] = seq_len * unit * bytes; parameters[4] = 0; - parameters[5] = (mMaxLength - seq_len) * hP * bytes; + parameters[5] = (mResource->mMaxLength - seq_len) * hP * bytes; matmulUnit((float*)(pack_qkv + (i * eP * unit) * bytes), (float*)(pack_qk + (i * seq_len * eP) * bytes), (float*)value_dst, parameters, nullptr, nullptr, nullptr, nullptr); } { @@ -355,46 +357,47 @@ ErrorCode CPUAttentionImpl::onExecute(Backend* _backend, const std::vectormHeadDim; parameters[3] = seq_len * unit * bytes; parameters[4] = 0; - parameters[5] = (mMaxLength - seq_len) * hP * bytes; + parameters[5] = (mResource->mMaxLength - seq_len) * hP * bytes; matmulRemain((float*)(pack_qkv + (loop_e * eP * unit) * bytes), (float*)(pack_qk + (loop_e * seq_len * eP) * bytes), (float*)value_dst, remain, parameters, nullptr, nullptr, nullptr, nullptr); } // transpose: [head_dim/unit, seq_len, unit] -> [seq_len, num_head, head_dim] - auto dst_ptr = outputs[0]->host() + h * mHeadDim * bytes; + auto dst_ptr = outputs[0]->host() + h * mResource->mHeadDim * bytes; if (bytes == 2) { - prefill_unpack(pack_qkv, dst_ptr, mNumHead, mHeadDim, unit, seq_len); + prefill_unpack(pack_qkv, dst_ptr, mResource->mNumHead, mResource->mHeadDim, unit, seq_len); } else { - prefill_unpack(pack_qkv, dst_ptr, mNumHead, mHeadDim, unit, seq_len); + prefill_unpack(pack_qkv, dst_ptr, mResource->mNumHead, mResource->mHeadDim, unit, seq_len); } } }; mDecode = [=](int tId) { - int kv_seq_len = mPastLength + 1; - auto pack_q = mPackQ->host() + tId * mHeadDim * eP * bytes; + int kv_seq_len = mResource->mPastLength + 1; + auto pack_q = mPackQ->host() + tId * mResource->mHeadDim * eP * bytes; auto pack_qk = mTempQK->host() + tId * (eP + 2) * kv_seq_len * bytes; auto unpack_qk = pack_qk + kv_seq_len * eP * bytes; auto mask_qk = reinterpret_cast(pack_qk); auto softmax_qk = reinterpret_cast(unpack_qk); - auto pack_qkv = mPackQKV->host() + tId * UP_DIV(mHeadDim, unit) * unit * bytes; + auto pack_qkv = mPackQKV->host() + tId * UP_DIV(mResource->mHeadDim, unit) * unit * bytes; int head_index = tId * tileCount; - for (int h = head_index; h < head_index + tileCount && h < mNumHead; h++) { - auto key_dst = mPastKey->host() + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * bytes; - auto value_dst = mPastValue->host() + h * mValueH * mMaxLength * hP * bytes; + for (int h = head_index; h < head_index + tileCount && h < mResource->mNumHead; h++) { + int kv_h = h / group_size; + auto key_dst = mResource->mPastKey->host() + kv_h * UP_DIV(mResource->mMaxLength, hP) * mResource->mHeadDim * hP * 
bytes; + auto value_dst = mResource->mPastValue->host() + kv_h * mResource->mValueH * mResource->mMaxLength * hP * bytes; // pack for matmul if (bytes == 2) { - decode_pack(query, key, value, pack_q, key_dst, value_dst, mMaxLength, mPastLength, mHeadDim, mValueH, eP, hP, h); + decode_pack(query, key, value, pack_q, key_dst, value_dst, mResource->mMaxLength, mResource->mPastLength, mResource->mHeadDim, mResource->mValueH, eP, hP, h, kv_h); } else { - decode_pack(query, key, value, pack_q, key_dst, value_dst, mMaxLength, mPastLength, mHeadDim, mValueH, eP, hP, h); + decode_pack(query, key, value, pack_q, key_dst, value_dst, mResource->mMaxLength, mResource->mPastLength, mResource->mHeadDim, mResource->mValueH, eP, hP, h, kv_h); } // query @ key: [1, head_dim] @ [head_dim, kv_seq_len] -> [1, kv_seq_len] size_t shapeParameters[6]; size_t* parameters = shapeParameters; parameters[0] = eP * bytes; - parameters[1] = mHeadDim; + parameters[1] = mResource->mHeadDim; parameters[2] = kv_seq_len; parameters[3] = seq_len * unit * bytes; parameters[4] = 0; @@ -403,9 +406,9 @@ ErrorCode CPUAttentionImpl::onExecute(Backend* _backend, const std::vectorMNNUnpackCUnitTranspose((float*)unpack_qk, (float*)pack_qk, seq_len, kv_seq_len, area_offset); if (bytes == 2) { - decode_softmax(mask_qk, softmax_qk, unpack_qk, pack_qk, mScale, eP, kv_seq_len); + decode_softmax(mask_qk, softmax_qk, unpack_qk, pack_qk, mResource->mScale, eP, kv_seq_len); } else { - decode_softmax(mask_qk, softmax_qk, unpack_qk, pack_qk, mScale, eP, kv_seq_len); + decode_softmax(mask_qk, softmax_qk, unpack_qk, pack_qk, mResource->mScale, eP, kv_seq_len); } // qk @ v: [1, kv_seq_len] @ [kv_seq_len, head_dim] -> [1, head_dim] { @@ -413,14 +416,14 @@ ErrorCode CPUAttentionImpl::onExecute(Backend* _backend, const std::vectormHeadDim; parameters[3] = 1 * unit * bytes; - parameters[5] = (mMaxLength - kv_seq_len) * hP * bytes; + parameters[5] = (mResource->mMaxLength - kv_seq_len) * hP * bytes; matmulRemain((float*)pack_qkv, (float*)pack_qk, (float*)value_dst, 1, parameters, nullptr, nullptr, nullptr, nullptr); } // transpose: [head_dim/unit, 1, unit] -> [1, num_head, head_dim] - auto dst_ptr = outputs[0]->host() + h * mHeadDim * bytes; - core->MNNUnpackCUnitTranspose((float*)dst_ptr, (float*)pack_qkv, 1, mHeadDim, area_offset); + auto dst_ptr = outputs[0]->host() + h * mResource->mHeadDim * bytes; + core->MNNUnpackCUnitTranspose((float*)dst_ptr, (float*)pack_qkv, 1, mResource->mHeadDim, area_offset); } }; mFunction = mIsDecode ? 
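The attention rework above is what the grouped-query llama variants (llama3, and the larger llama2 models) need: the KV cache is now allocated per KV head (mKvNumHead) instead of per query head, and every query head h reaches its cache slot through kv_h = h / group_size, where group_size = mNumHead / mKvNumHead. A minimal C++ sketch of that mapping, using illustrative names rather than MNN's members:

    #include <cassert>
    #include <cstdio>

    // Sketch of the kv_h = h / group_size indexing used in the prefill/decode
    // lambdas above; the function and parameter names here are illustrative.
    int kvHeadFor(int queryHead, int numQueryHeads, int numKvHeads) {
        assert(numQueryHeads % numKvHeads == 0);  // GQA needs an exact grouping
        const int groupSize = numQueryHeads / numKvHeads;
        return queryHead / groupSize;             // several query heads share one cache slot
    }

    int main() {
        // A llama3-8B style layout: 32 query heads sharing 8 KV heads.
        for (int h = 0; h < 32; ++h) {
            std::printf("query head %2d -> kv head %d\n", h, kvHeadFor(h, 32, 8));
        }
        return 0;
    }

With 32 query heads over 8 KV heads, heads 0 to 3 read and write KV slot 0, heads 4 to 7 slot 1, and so on, which is why the cache tensors above are sized by mKvNumHead.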
mDecode : mPrefill; @@ -430,32 +433,25 @@ ErrorCode CPUAttentionImpl::onExecute(Backend* _backend, const std::vectormPastLength += mIsDecode; return NO_ERROR; } -CPUAttention::CPUAttention(Backend* backend, bool kv_cahce) : Execution(backend) { - mImpl.reset(new CPUAttentionImpl(backend, kv_cahce)); -} - -CPUAttention::CPUAttention(std::shared_ptr impl, Backend *backend) : Execution(backend), mImpl(impl) {} - -ErrorCode CPUAttention::onResize(const std::vector& inputs, const std::vector& outputs) { - return mImpl->onResize(backend(), inputs, outputs); -} - -ErrorCode CPUAttention::onExecute(const std::vector& inputs, const std::vector& outputs) { - return mImpl->onExecute(backend(), inputs, outputs); -} - bool CPUAttention::onClone(Backend* bn, const Op* op, Execution** dst) { if (nullptr == dst) { return true; } - *dst = new CPUAttention(mImpl, bn); + auto tmp = new CPUAttention(bn, mKVCache); + tmp->mResource = mResource; + *dst = tmp; return true; } +CPUAttention::CPUAttention(Backend *backend, bool kv_cache) : Execution(backend) { + mKVCache = kv_cache; + mResource.reset(new Resource); +} + class CPUAttentionCreator : public CPUBackend::Creator { public: virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, diff --git a/source/backend/cpu/CPUAttention.hpp b/source/backend/cpu/CPUAttention.hpp index 1cbc26aba..6e3154db7 100644 --- a/source/backend/cpu/CPUAttention.hpp +++ b/source/backend/cpu/CPUAttention.hpp @@ -16,41 +16,32 @@ namespace MNN { -class CPUAttentionImpl { -public: - CPUAttentionImpl(Backend *backend, bool kv_cache) : mBackend(backend), mKVCache(kv_cache) {} - ~CPUAttentionImpl() = default; - ErrorCode onResize(Backend *backend, const std::vector &inputs, const std::vector &outputs); - ErrorCode onExecute(Backend *backend, const std::vector &inputs, const std::vector &outputs); -private: - void allocKVCache(); - void reallocKVCache(); - Backend* backend() { return mBackend; } -private: - Backend* mBackend; - bool mKVCache; - float mScale; - const int mExpandChunk = 64; - int mThreadNum = 1; - bool mIsDecode = false; - int mPastLength = 0, mMaxLength = 0; - std::shared_ptr mPastKey, mPastValue, mTempQK; - std::shared_ptr mPackQ, mPackQKV; - int mNumHead = 0, mHeadDim = 0, mValueH = 0; - int eP, lP, hP, bytes; - std::function mFunction, mPrefill, mDecode; -}; class CPUAttention : public Execution { public: CPUAttention(Backend *backend, bool kv_cache); - CPUAttention(std::shared_ptr impl, Backend *backend); virtual ~CPUAttention() = default; virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override; + struct Resource { + std::shared_ptr mPastKey; + std::shared_ptr mPastValue; + float mScale; + const int mExpandChunk = 64; + int mPastLength = 0, mMaxLength = 0; + int mNumHead = 0, mKvNumHead = 0, mHeadDim = 0, mValueH = 0; + }; private: - std::shared_ptr mImpl; + void allocKVCache(); + void reallocKVCache(); + bool mIsDecode = false; + bool mKVCache; + int mThreadNum = 1; + std::shared_ptr mResource; + std::shared_ptr mTempQK, mPackQ, mPackQKV; + int eP, lP, hP, bytes; + std::function mFunction, mPrefill, mDecode; }; } // namespace MNN diff --git a/source/backend/cpu/CPURaster.cpp b/source/backend/cpu/CPURaster.cpp index 6d5f587b7..3722e8a27 100644 --- a/source/backend/cpu/CPURaster.cpp +++ b/source/backend/cpu/CPURaster.cpp @@ -93,6 +93,7 @@ 
ErrorCode CPURaster::onResize(const std::vector &____inputs, const std } // input is NC4HW4 add Convert std::vector forRelease; + TensorUtils::FuseWrap fuseUtils; for (int i=0; i< des->regions.size(); ++i) { auto& slice = des->regions[i]; auto origin = slice.origin; @@ -125,10 +126,11 @@ ErrorCode CPURaster::onResize(const std::vector &____inputs, const std regionTmp.size[1] = core->pack; regionTmp.size[2] = area; regionTmp.origin = slice.origin; - std::shared_ptr newSlice(new Tensor::InsideDescribe::Region); - *newSlice = slice; - bool merge = TensorUtils::fuseRegion(regionTmp, *newSlice); + bool merge = fuseUtils.match(regionTmp, slice); if (merge) { + std::shared_ptr newSlice(new Tensor::InsideDescribe::Region); + *newSlice = slice; + fuseUtils.apply(regionTmp, *newSlice); // cache the merged tensor mTempInputCopy.emplace_back(std::make_pair(origin, newSlice.get())); mCacheRegions.emplace_back(newSlice); diff --git a/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMulRemain_int4.S b/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMulRemain_int4.S index e0dabde36..337c419a6 100644 --- a/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMulRemain_int4.S +++ b/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMulRemain_int4.S @@ -12,6 +12,13 @@ .text .align 5 +.macro MNN_ADD_FLAOT s0, s1, s2, s3, z0, z1, z2, z3 + fadd \s0\().4s, \s0\().4s, \z0\().4s + fadd \s1\().4s, \s1\().4s, \z1\().4s + fadd \s2\().4s, \s2\().4s, \z2\().4s + fadd \s3\().4s, \s3\().4s, \z3\().4s + +.endm // 12 * 8 MatMul asm_function MNNPackedMatMulRemain_int4 //void MNNPackedMatMulRemain_int4(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b); @@ -36,8 +43,10 @@ ldr x9, [x4, #8] // l ldr x10, [x4, #16] // h ldr x7, [x4, #24] // cStride -ldr x19, [x4, #40] // bExtraStride +ldr x19, [x4, #40] // bExtraStride = (LSize - l) * (hP * sizeof(int4_t)) +ldr x26, [x4, #48] // blockId +//add x19, x19, x9, LSL #2 // bStride = (hP * sizeof(int4_t)) * l + bExtraStride add x10, x10, #3 lsr x10, x10, #2 @@ -47,6 +56,7 @@ dup v6.4s, v5.s[2] // Min Value dup v7.4s, v5.s[3] // Max Value Start: +sub x25, x7, #64 E8: cmp x3, #8 @@ -195,8 +205,20 @@ LoopE8: fmla v30.4s, v15.4s, v1.s[2] fmla v31.4s, v15.4s, v1.s[3] - cbz x5, StoreLH8 + cbz x26, AddBiasLH8 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0], x25 + MNN_ADD_FLAOT v16, v17, v18, v19, v0, v1, v2, v3 + MNN_ADD_FLAOT v24, v25, v26, v27, v8, v9, v10, v11 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0] + MNN_ADD_FLAOT v20, v21, v22, v23, v0, v1, v2, v3 + MNN_ADD_FLAOT v28, v29, v30, v31, v8, v9, v10, v11 + sub x0, x0, #128 + sub x0, x0, x25 + AddBiasLH8: + cbz x20, PostTreatLH8 ld1 {v0.4s, v1.4s}, [x20], #32 fmla v16.4s, v0.4s, v5.s[1] @@ -220,6 +242,7 @@ LoopE8: fmla v31.4s, v1.4s, v5.s[1] PostTreatLH8: + cbz x5, StoreLH8 fmax v16.4s, v16.4s, v6.4s fmax v17.4s, v17.4s, v6.4s fmax v18.4s, v18.4s, v6.4s @@ -355,8 +378,22 @@ LoopE8: fmla v22.4s, v14.4s, v1.s[2] fmla v23.4s, v14.4s, v1.s[3] - cbz x5, StoreLH8x4 + cbz x26, AddBiasLH8x4 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0] + fadd v16.4s, v16.4s, v0.4s + fadd v17.4s, v17.4s, v1.4s + fadd v18.4s, v18.4s, v2.4s + fadd v19.4s, v19.4s, v3.4s + + fadd v20.4s, v20.4s, v8.4s + fadd v21.4s, v21.4s, v9.4s + fadd v22.4s, v22.4s, v10.4s + fadd v23.4s, v23.4s, v11.4s + sub x0, x0, #64 + AddBiasLH8x4: + 
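Across these low-memory GEMM kernels the new blockId word (parameter[6], loaded into x26 here and x21 in the non-remain kernels) switches the output tile from an overwrite to an accumulation: when blockId is non-zero the existing C tile is loaded and added into (the MNN_ADD_FLAOT macro and the LE*_BLOCK_GT_0 paths) instead of being re-initialized with fmul, so a weight tensor quantized in several blocks along the reduction dimension can be processed one block at a time. Bias is likewise folded in only when a bias pointer is present, and the min/max clamp only when postParameters is, which lets the caller defer both to the final block. A rough scalar C++ sketch of that contract, using plain row-major layouts instead of MNN's packed eP/hP tiles:

    #include <algorithm>
    #include <cstddef>

    // Reference semantics of one quantization block, as implied by the blockId
    // flag above. Names and layouts are illustrative, not MNN's.
    void blockGemmReference(float* C, const float* A, const float* B,
                            size_t e, size_t lBlock, size_t h,
                            int blockId, const float* bias, const float* minMax) {
        for (size_t x = 0; x < e; ++x) {
            for (size_t y = 0; y < h; ++y) {
                // blockId > 0: accumulate onto the partial result already in C.
                float acc = (blockId > 0) ? C[x * h + y] : 0.0f;
                for (size_t z = 0; z < lBlock; ++z) {
                    acc += A[x * lBlock + z] * B[z * h + y];
                }
                if (bias != nullptr) {    // passed only with the last block
                    acc += bias[y];
                }
                if (minMax != nullptr) {  // relu/relu6 clamp, last block only
                    acc = std::min(std::max(acc, minMax[0]), minMax[1]);
                }
                C[x * h + y] = acc;
            }
        }
    }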
cbz x20, PostTreatLH8x4 ld1 {v0.4s}, [x20] fmla v16.4s, v0.4s, v5.s[1] @@ -370,6 +407,7 @@ LoopE8: fmla v23.4s, v0.4s, v5.s[1] PostTreatLH8x4: + cbz x5, StoreLH8x4 fmax v16.4s, v16.4s, v6.4s fmax v17.4s, v17.4s, v6.4s fmax v18.4s, v18.4s, v6.4s @@ -444,6 +482,7 @@ blt E1 fadd v4.4s, v4.4s, v15.4s ld1 {v0.4s}, [x15], x11 + cbnz x26, LE4H8_BLOCK_GT_0 fmul v16.4s, v3.4s, v0.s[0] fmul v17.4s, v3.4s, v0.s[1] fmul v18.4s, v3.4s, v0.s[2] @@ -453,7 +492,24 @@ blt E1 fmul v21.4s, v4.4s, v0.s[1] fmul v22.4s, v4.4s, v0.s[2] fmul v23.4s, v4.4s, v0.s[3] + b LE4H8_INIT_END + + LE4H8_BLOCK_GT_0: + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], x7 + ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0] + fmla v16.4s, v3.4s, v0.s[0] + fmla v17.4s, v3.4s, v0.s[1] + fmla v18.4s, v3.4s, v0.s[2] + fmla v19.4s, v3.4s, v0.s[3] + fmla v20.4s, v4.4s, v0.s[0] + fmla v21.4s, v4.4s, v0.s[1] + fmla v22.4s, v4.4s, v0.s[2] + fmla v23.4s, v4.4s, v0.s[3] + sub x0, x0, x7 + + + LE4H8_INIT_END: beq E4LoopLEnd subs x12, x12, #1 @@ -524,9 +580,8 @@ blt E1 sub x8, x8, #2 cmp x8, #2 - cbz x5, StoreLH4x8 - AddBiasLH4x8: + cbz x20, PostTreatLH4x8 ld1 {v0.4s, v1.4s}, [x20], #32 fmla v16.4s, v0.4s, v5.s[1] @@ -540,6 +595,7 @@ blt E1 fmla v23.4s, v1.4s, v5.s[1] PostTreatLH4x8: + cbz x5, StoreLH4x8 fmax v16.4s, v16.4s, v6.4s fmax v17.4s, v17.4s, v6.4s fmax v18.4s, v18.4s, v6.4s @@ -593,11 +649,22 @@ blt E1 fadd v3.4s, v3.4s, v14.4s ld1 {v0.4s}, [x15], x11 + + cbnz x26, LE4H4_BLOCK_GT_0 fmul v16.4s, v3.4s, v0.s[0] fmul v17.4s, v3.4s, v0.s[1] fmul v18.4s, v3.4s, v0.s[2] fmul v19.4s, v3.4s, v0.s[3] + b LE4H4_INIT_END + LE4H4_BLOCK_GT_0: + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] + fmla v16.4s, v3.4s, v0.s[0] + fmla v17.4s, v3.4s, v0.s[1] + fmla v18.4s, v3.4s, v0.s[2] + fmla v19.4s, v3.4s, v0.s[3] + + LE4H4_INIT_END: beq E4LoopLREnd E4LoopLR: @@ -623,8 +690,8 @@ blt E1 bne E4LoopLR E4LoopLREnd: - cbz x5, StoreLH4x4 AddBiasLH4x4: + cbz x20, PostTreatLH4x4 ld1 {v0.4s}, [x20] fmla v16.4s, v0.4s, v5.s[1] @@ -634,6 +701,7 @@ blt E1 PostTreatLH4x4: + cbz x5, StoreLH4x4 fmax v16.4s, v16.4s, v6.4s fmax v17.4s, v17.4s, v6.4s fmax v18.4s, v18.4s, v6.4s @@ -777,14 +845,22 @@ LoopE1: fadd v16.4s, v14.4s, v16.4s fadd v20.4s, v15.4s, v20.4s - cbz x5, StoreLH1x8 + cbz x26, AddBiasLH1x8 + ld1 {v2.4s}, [x0], x7 + ld1 {v3.4s}, [x0] + sub x0, x0, x7 + fadd v16.4s, v16.4s, v2.4s + fadd v20.4s, v20.4s, v3.4s + AddBiasLH1x8: + cbz x20, PostTreatLH1x8 ld1 {v0.4s, v1.4s}, [x20], #32 fmla v16.4s, v0.4s, v5.s[1] fmla v20.4s, v1.4s, v5.s[1] PostTreatLH1x8: + cbz x5, StoreLH1x8 fmax v16.4s, v16.4s, v6.4s fmax v20.4s, v20.4s, v6.4s fmin v16.4s, v16.4s, v7.4s @@ -839,12 +915,18 @@ LoopE1: fmul v16.4s, v12.4s, v16.4s fmla v16.4s, v14.4s, v4.s[0] - cbz x5, StoreLH1x4 + cbz x26, AddBiasLH1x4 + ld1 {v0.4s}, [x0] + fadd v16.4s, v16.4s, v0.4s + b PostTreatLH1x4 + AddBiasLH1x4: + cbz x20, PostTreatLH1x4 ld1 {v0.4s}, [x20] fmla v16.4s, v0.4s, v5.s[1] PostTreatLH1x4: + cbz x5, StoreLH1x4 fmax v16.4s, v16.4s, v6.4s fmin v16.4s, v16.4s, v7.4s diff --git a/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMulRemain_int8.S b/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMulRemain_int8.S index 4daaf415c..1b9677c13 100644 --- a/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMulRemain_int8.S +++ b/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMulRemain_int8.S @@ -12,6 +12,14 @@ .text .align 5 + +.macro MNN_ADD_FLAOT s0, s1, s2, s3, z0, z1, z2, z3 + fadd \s0\().4s, \s0\().4s, \z0\().4s + fadd \s1\().4s, \s1\().4s, \z1\().4s + fadd \s2\().4s, \s2\().4s, \z2\().4s + fadd 
\s3\().4s, \s3\().4s, \z3\().4s + +.endm // 12 * 8 MatMul asm_function MNNPackedMatMulRemain_int8 //void MNNPackedMatMulRemain_int4(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b); @@ -34,17 +42,19 @@ ldr x10, [x4, #16] // h ldr x7, [x4, #24] // cStride ldr x19, [x4, #40] // bExtraStride +ldr x26, [x4, #48] // blockId add x10, x10, #3 lsr x10, x10, #2 -lsl x25, x9, #3 // l*hPack -add x25, x25, x19 +//lsl x25, x9, #3 // l*hPack +//add x25, x25, x19 cbz x5, Start ld1 {v5.4s}, [x5] dup v6.4s, v5.s[2] // Min Value dup v7.4s, v5.s[3] // Max Value Start: +sub x25, x7, #64 E8: cmp x3, #8 @@ -143,8 +153,21 @@ LoopE8: sub x8, x8, #2 cmp x8, #2 - cbz x5, StoreLH8 + + cbz x26, AddBiasLH8 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v16, v17, v18, v19, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], x25 + MNN_ADD_FLAOT v24, v25, v26, v27, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v20, v21, v22, v23, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] + MNN_ADD_FLAOT v28, v29, v30, v31, v0, v1, v2, v3 + sub x0, x0, #128 + sub x0, x0, x25 + AddBiasLH8: + cbz x5, StoreLH8 ld1 {v0.4s, v1.4s}, [x20], #32 fmla v16.4s, v0.4s, v5.s[1] @@ -277,8 +300,15 @@ LoopE8: bne LoopLR LoopLREnd: - cbz x5, StoreLH8x4 + cbz x26, AddBiasLH8x4 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v16, v17, v18, v19, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] + MNN_ADD_FLAOT v20, v21, v22, v23, v0, v1, v2, v3 + sub x0, x0, #64 + AddBiasLH8x4: + cbz x20, PostTreatLH8x4 ld1 {v0.4s}, [x20] fmla v16.4s, v0.4s, v5.s[1] @@ -292,6 +322,7 @@ LoopE8: fmla v23.4s, v0.4s, v5.s[1] PostTreatLH8x4: + cbz x5, StoreLH8x4 fmax v16.4s, v16.4s, v6.4s fmax v17.4s, v17.4s, v6.4s fmax v18.4s, v18.4s, v6.4s @@ -433,9 +464,15 @@ blt E1 sub x8, x8, #2 cmp x8, #2 - cbz x5, StoreLH4x8 + cbz x26, AddBiasLH4x8 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], x7 + MNN_ADD_FLAOT v16, v17, v18, v19, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] + MNN_ADD_FLAOT v20, v21, v22, v23, v0, v1, v2, v3 + sub x0, x0, x7 AddBiasLH4x8: + cbz x20, PostTreatLH4x8 ld1 {v0.4s, v1.4s}, [x20], #32 fmla v16.4s, v0.4s, v5.s[1] @@ -449,6 +486,7 @@ blt E1 fmla v23.4s, v1.4s, v5.s[1] PostTreatLH4x8: + cbz x5, StoreLH4x8 fmax v16.4s, v16.4s, v6.4s fmax v17.4s, v17.4s, v6.4s fmax v18.4s, v18.4s, v6.4s @@ -523,8 +561,12 @@ blt E1 bne E4LoopLR E4LoopLREnd: - cbz x5, StoreLH4x4 + cbz x26, AddBiasLH4x4 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] + MNN_ADD_FLAOT v16, v17, v18, v19, v0, v1, v2, v3 + AddBiasLH4x4: + cbz x5, StoreLH4x4 ld1 {v0.4s}, [x20] fmla v16.4s, v0.4s, v5.s[1] @@ -620,8 +662,15 @@ LoopE1: sub x8, x8, #2 cmp x8, #2 - cbz x5, StoreLH1x8 + cbz x26, AddBiasLH1x8 + ld1 {v0.4s}, [x0], x7 + ld1 {v1.4s}, [x0] + fadd v16.4s, v16.4s, v0.4s + fadd v20.4s, v20.4s, v1.4s + sub x0, x0, x7 + AddBiasLH1x8: + cbz x5, StoreLH1x8 ld1 {v0.4s, v1.4s}, [x20], #32 fmla v16.4s, v0.4s, v5.s[1] @@ -676,8 +725,12 @@ LoopE1: bne E1LoopLR E1LoopLREnd: - cbz x5, StoreLH1x4 + cbz x26, AddBiasLH1x4 + ld1 {v0.4s}, [x0] + fadd v16.4s, v16.4s, v0.4s + AddBiasLH1x4: + cbz x5, StoreLH1x4 ld1 {v0.4s}, [x20] fmla v16.4s, v0.4s, v5.s[1] diff --git a/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMul_int4.S b/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMul_int4.S index 4528e96dd..fe2691168 100644 --- a/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMul_int4.S +++ 
b/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMul_int4.S @@ -11,22 +11,36 @@ .text .align 5 + +.macro MNN_ADD_FLAOT s0, s1, s2, s3, z0, z1, z2, z3 + fadd \s0\().4s, \s0\().4s, \z0\().4s + fadd \s1\().4s, \s1\().4s, \z1\().4s + fadd \s2\().4s, \s2\().4s, \z2\().4s + fadd \s3\().4s, \s3\().4s, \z3\().4s + +.endm + // 12 * 8 MatMul asm_function MNNPackedMatMul_int4 //void MNNPackedMatMul(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias); // x0: C, x1:A, x2:B, x3:parameter, x4: postParameters, x5:bias, x6: k, x7: b -stp d14, d15, [sp, #-80]! +stp d14, d15, [sp, #-112]! stp d12, d13, [sp, #16] stp d10, d11, [sp, #32] stp d8, d9, [sp, #48] stp x19, x20, [sp, #64] +stp x21, x22, [sp, #80] +stp x23, x24, [sp, #96] //ldr x8, [x3, #0] // deprecated ldr x9, [x3, #8] // l ldr x10, [x3, #16] // h ldr x13, [x3, #24] // cStride -ldr x11, [x3, #40] // bExtraStride +ldr x11, [x3, #40] // bExtraStride = (LSize - l) * (hP * sizeof(int4_t)) +ldr x21, [x3, #48] // blockId + +//add x11, x11, x9, LSL #2 // bStride = (hP * sizeof(int4_t)) * l + bExtraStride // v0, v1, v2: A // v3, v4: B @@ -45,7 +59,7 @@ cmp x10, #2 blt LH4 LH8: -// sub x14, x13, #160 +sub x14, x13, #128 LoopH: mov x15, x1 @@ -231,14 +245,33 @@ LoopH: fmla v30.4s, v7.4s, v2.s[2] fmla v31.4s, v7.4s, v2.s[3] - cbz x4, StoreLH8 + cbz x21, AddBiasLH8 + // add dst value + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v8, v9, v10, v11, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v12, v13, v14, v15, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], x14 + MNN_ADD_FLAOT v16, v17, v18, v19, v0, v1, v2, v3 + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v20, v21, v22, v23, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v24, v25, v26, v27, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] + MNN_ADD_FLAOT v28, v29, v30, v31, v0, v1, v2, v3 + sub x0, x0, #256 + sub x0, x0, x14 + AddBiasLH8: + cbz x4, StoreLH8 ld1 {v5.4s}, [x4] dup v6.4s, v5.s[2] // Min Value dup v7.4s, v5.s[3] // Max Value - ld1 {v0.4s, v1.4s}, [x5], #32 + cbz x5, PostTreatLH8 + ld1 {v0.4s, v1.4s}, [x5], #32 fmla v8.4s, v0.4s, v5.s[1] fmla v9.4s, v0.4s, v5.s[1] fmla v10.4s, v0.4s, v5.s[1] @@ -452,11 +485,22 @@ LoopHRemain: fmla v18.4s, v21.4s, v6.s[2] fmla v19.4s, v21.4s, v6.s[3] - cbz x4, StoreLH4 + cbz x21, AddBiasLH4 + // add dst value + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v8, v9, v10, v11, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v12, v13, v14, v15, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] + MNN_ADD_FLAOT v16, v17, v18, v19, v0, v1, v2, v3 + sub x0, x0, #128 + AddBiasLH4: + cbz x4, StoreLH4 ld1 {v5.4s}, [x4] dup v6.4s, v5.s[2] // Min Value dup v7.4s, v5.s[3] // Max Value + cbz x5, PostTreatLH4 ld1 {v0.4s}, [x5], #16 fmla v8.4s, v0.4s, v5.s[1] @@ -511,11 +555,13 @@ LoopHRemain: End: +ldp x23, x24, [sp, #96] +ldp x21, x22, [sp, #80] ldp x19, x20, [sp, #64] ldp d8, d9, [sp, #48] ldp d10, d11, [sp, #32] ldp d12, d13, [sp, #16] -ldp d14, d15, [sp], #80 +ldp d14, d15, [sp], #112 ret diff --git a/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMul_int8.S b/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMul_int8.S index 346cba3c5..e4716e99f 100644 --- a/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMul_int8.S +++ b/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMul_int8.S @@ -11,23 +11,33 @@ .text .align 5 + +.macro 
MNN_ADD_FLAOT s0, s1, s2, s3, z0, z1, z2, z3 + fadd \s0\().4s, \s0\().4s, \z0\().4s + fadd \s1\().4s, \s1\().4s, \z1\().4s + fadd \s2\().4s, \s2\().4s, \z2\().4s + fadd \s3\().4s, \s3\().4s, \z3\().4s + +.endm // 12 * 8 MatMul asm_function MNNPackedMatMul_int8 //void MNNPackedMatMul(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias); // x0: C, x1:A, x2:B, x3:parameter, x4: postParameters, x5:bias, x6: k, x7: b -stp d14, d15, [sp, #-80]! +stp d14, d15, [sp, #-96]! stp d12, d13, [sp, #16] stp d10, d11, [sp, #32] stp d8, d9, [sp, #48] stp x19, x20, [sp, #64] +stp x21, x22, [sp, #80] //ldr x8, [x3, #0] // deprecated ldr x9, [x3, #8] // l ldr x10, [x3, #16] // h ldr x13, [x3, #24] // cStride -ldr x11, [x3, #40] // bExtraStride - +ldr x11, [x3, #40] // bExtraStride = (LSize - l) * hP +ldr x21, [x3, #48] // blockId +//add x11, x11, x9, LSL #3 // bStride = hP * l + bExtraStride // v0, v1, v2: A // v3, v4: B // v8 - v31: C @@ -45,7 +55,7 @@ cmp x10, #2 blt LH4 LH8: -// sub x14, x13, #160 +sub x14, x13, #128 LoopH: mov x15, x1 @@ -237,9 +247,26 @@ LoopH: sub x10, x10, #2 cmp x10, #2 - cbz x4, StoreLH8 + cbz x21, AddBiasLH8 + // add dst value + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v8, v9, v10, v11, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v12, v13, v14, v15, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], x14 + MNN_ADD_FLAOT v16, v17, v18, v19, v0, v1, v2, v3 + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v20, v21, v22, v23, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v24, v25, v26, v27, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] + MNN_ADD_FLAOT v28, v29, v30, v31, v0, v1, v2, v3 + sub x0, x0, #256 + sub x0, x0, x14 AddBiasLH8: + cbz x4, StoreLH8 ld1 {v5.4s}, [x4] dup v6.4s, v5.s[2] // Min Value dup v7.4s, v5.s[3] // Max Value @@ -460,8 +487,19 @@ LoopHRemain: LoopLREnd: - cbz x4, StoreLH4 + cbz x21, AddBiasLH4 + // add dst value + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v8, v9, v10, v11, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + MNN_ADD_FLAOT v12, v13, v14, v15, v0, v1, v2, v3 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] + MNN_ADD_FLAOT v16, v17, v18, v19, v0, v1, v2, v3 + + sub x0, x0, #128 + AddBiasLH4: + cbz x4, StoreLH4 ld1 {v5.4s}, [x4] dup v6.4s, v5.s[2] // Min Value dup v7.4s, v5.s[3] // Max Value @@ -524,11 +562,12 @@ LoopHRemain: End: +ldp x21, x22, [sp, #80] ldp x19, x20, [sp, #64] ldp d8, d9, [sp, #48] ldp d10, d11, [sp, #32] ldp d12, d13, [sp, #16] -ldp d14, d15, [sp], #80 +ldp d14, d15, [sp], #96 ret diff --git a/source/backend/cpu/compute/CommonOptFunction.cpp b/source/backend/cpu/compute/CommonOptFunction.cpp index 72983f925..f9ce9567c 100644 --- a/source/backend/cpu/compute/CommonOptFunction.cpp +++ b/source/backend/cpu/compute/CommonOptFunction.cpp @@ -588,21 +588,15 @@ static void _MNNPackedMatMulRemain_int4(float* C, const float* A, const float* f auto hRemain = parameter[4]; float weightBytes = 0.5; // sizeof(int4_t) auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; auto hC4 = UP_DIV(h, 4); - for (int y=0; y().max(); float maxValue = std::numeric_limits().max(); if (nullptr != postParameters) { minValue = postParameters[2]; maxValue = postParameters[3]; - alpha = postParameters[0]; - beta = postParameters[1]; } + int blockId = parameter[6]; for (int x=0; x 0) { + 
summer[0] = dstY[0]; + summer[1] = dstY[1]; + summer[2] = dstY[2]; + summer[3] = dstY[3]; + } + if (nullptr != bias && nullptr != postParameters) { for (int v=0; v<4; ++v) { - summer[v] = bias[4 * y + v]; + summer[v] += bias[4 * y + v]; } } for (int z=0; z(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; auto hC4 = UP_DIV(h, 4); - for (int y=0; y().max(); float maxValue = std::numeric_limits().max(); if (nullptr != postParameters) { minValue = postParameters[2]; maxValue = postParameters[3]; - alpha = postParameters[0]; - beta = postParameters[1]; } + int blockId = parameter[6]; for (int x=0; x 0) { + summer[0] = dstY[0]; + summer[1] = dstY[1]; + summer[2] = dstY[2]; + summer[3] = dstY[3]; + } + if (nullptr != bias && nullptr != postParameters) { for (int v=0; v<4; ++v) { - summer[v] = bias[4 * y + v]; + summer[v] += bias[4 * y + v]; } } for (int z=0; zcanUseInt4) { for (int i = 0; i < h; ++i) { int8Info->alpha.get()[i] *= -8.0; - core->MNNFp32ToLowp(int8Info->alpha.get(), reinterpret_cast(biasPtr), h); } + core->MNNFp32ToLowp(int8Info->alpha.get(), reinterpret_cast(biasPtr), h); } } } else { @@ -86,6 +86,19 @@ bool ConvolutionHybrid::initQuantizeResource(std::shared_ptr data(weightLength, 0); + auto srcWInt8 = int8Info->weight.get(); + if (hP * hU != outputCount || lP * lU != srcChannel) { + int packedic = lU * lP; + for (int i = 0; i < outputCount; ++i) { + for (int j = 0; j < srcChannel; ++j) { + int destIdx = i * packedic + j; + int srcIdx = i * srcChannel + j; + data[destIdx] = srcWInt8[srcIdx]; + } + } + srcWInt8 = data.data(); + } if (int8Info->canUseInt4) { MNN_ASSERT(weightLength % 2 == 0); weightLength = UP_DIV(weightLength, 2); @@ -101,10 +114,10 @@ bool ConvolutionHybrid::initQuantizeResource(std::shared_ptrmWeight->host(); - auto srcWInt8 = int8Info->weight.get(); // oc, ic -> oc/hP, ic/lP, hP, lP for (int i = 0; i < hU; i++) { for (int j = 0; j < lU; j++) { for (int k = 0; k < hP; k++) { for (int l = 0; l < lP; l++) { - dstWInt8[i * srcChannel * hP + j * hP * lP + k * lP + l] = srcWInt8[(i * hP + k) * srcChannel + (j * lP + l)]; + dstWInt8[i * lU * lP * hP + j * hP * lP + k * lP + l] = srcWInt8[(i * hP + k) * lP * lU + (j * lP + l)]; } } } diff --git a/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp b/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp index 10df5d863..f844d7d5b 100644 --- a/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp +++ b/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp @@ -17,7 +17,7 @@ #include "math/Vec.hpp" #include "core/BufferAllocator.hpp" #include "core/MemoryFormater.h" -#define PARAMETERSIZE 6 +#define PARAMETERSIZE 7 using Vec4 = MNN::Math::Vec; namespace MNN { @@ -60,7 +60,15 @@ bool DenseConvolutionTiledExecutor::initQuantizeResource(std::shared_ptrmDequantize.mScaleBias.reset(MNN::Tensor::createDevice({hU * hP * 2})); + int dequantCnt = int8Info->alpha.size(); + int scaleSize = dequantCnt; // real size + if (int8Info->asymmetric) { + scaleSize = dequantCnt / 2; + + } + int blockNum = scaleSize / outputCount; + scaleSize = blockNum * hU * hP; // pack size + resource->mDequantize.mScaleBias.reset(MNN::Tensor::createDevice({scaleSize * 2 * bytes})); res = resource->backend->onAcquireBuffer(resource->mDequantize.mScaleBias.get(), Backend::STATIC); if (!res) { return false; @@ -88,36 +96,56 @@ bool DenseConvolutionTiledExecutor::initQuantizeResource(std::shared_ptrmWeight = weightLow; } auto alphaPtr = 
resource->mDequantize.mScaleBias->host(); - auto biasPtr = reinterpret_cast(reinterpret_cast(alphaPtr) + hU * hP * bytes); - ::memset(alphaPtr, 0, 2 * hU * hP * bytes); + auto biasPtr = reinterpret_cast(reinterpret_cast(alphaPtr) + scaleSize * bytes); + ::memset(alphaPtr, 0, 2 * scaleSize * bytes); int h = int8Info->alpha.size(); if (bytes == 2) { auto core = static_cast(resource->backend)->functions(); - std::vector tmpAlpha(hU*hP*2, 0.0f); + std::vector tmpAlpha(scaleSize * 2, 0.0f); if (int8Info->asymmetric) { - int hh = h / 2; - for (int i=0; ialpha.get()[2 * i + 1]; - tmpAlpha[i+hU*hP] = int8Info->alpha.get()[2 * i] + (float)originOffset * int8Info->alpha.get()[2 * i + 1]; + for (int i = 0; i < blockNum; ++i) { + auto dstAlpha = tmpAlpha.data() + i * hU * hP; + auto srcAlpha = int8Info->alpha.get(); + for (int j = 0; j < outputCount; ++j) { + int scaleIndex = j * blockNum + i; + dstAlpha[j] = srcAlpha[2 * scaleIndex + 1]; + dstAlpha[j + scaleSize] = srcAlpha[2 * scaleIndex] + (float)originOffset * dstAlpha[j]; + } } } else { - for (int i=0; ialpha.get()[i]; - tmpAlpha[i+hU*hP] = (float)originOffset * int8Info->alpha.get()[i]; + for (int i = 0; i < blockNum; ++i) { + auto dstAlpha = tmpAlpha.data() + i * hU * hP; + auto srcAlpha = int8Info->alpha.get(); + for (int j = 0; j < outputCount; ++j) { + int scaleIndex = j * blockNum + i; + dstAlpha[j] = srcAlpha[scaleIndex]; + dstAlpha[j + scaleSize] = (float)originOffset * dstAlpha[j]; + } } } - core->MNNFp32ToLowp(tmpAlpha.data(), reinterpret_cast(alphaPtr), hU*hP*2); + core->MNNFp32ToLowp(tmpAlpha.data(), reinterpret_cast(alphaPtr), scaleSize * 2); } else { if (int8Info->asymmetric) { - int hh = h / 2; - for (int i=0; ialpha.get()[2 * i + 1]; - biasPtr[i] = int8Info->alpha.get()[2 * i] + (float)originOffset * alphaPtr[i]; + for (int i = 0; i < blockNum; ++i) { + auto dstAlpha = alphaPtr + i * hU * hP; + auto dstBias = biasPtr + i * hU * hP; + auto srcAlpha = int8Info->alpha.get(); + for (int j = 0; j < outputCount; ++j) { + int scaleIndex = j * blockNum + i; + dstAlpha[j] = srcAlpha[2 * scaleIndex + 1]; + dstBias[j] = srcAlpha[2 * scaleIndex] + (float)originOffset * dstAlpha[j]; + } } } else { - for (int i=0; ialpha.get()[i]; - biasPtr[i] = 0.f + (float)originOffset * alphaPtr[i]; + for (int i = 0; i < blockNum; ++i) { + auto dstAlpha = alphaPtr + i * hU * hP; + auto dstBias = biasPtr + i * hU * hP; + auto srcAlpha = int8Info->alpha.get(); + for (int j = 0; j < outputCount; ++j) { + int scaleIndex = j * blockNum + i; + dstAlpha[j] = srcAlpha[scaleIndex]; + dstBias[j] = (float)originOffset * dstAlpha[j]; + } } } } @@ -435,11 +463,27 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs auto weightType = weight->getType(); const uint8_t* dequantAlpha = nullptr; const uint8_t* dequantBias = nullptr; + auto ic = input->channel(); + auto icC4 = UP_DIV(ic, unit); + auto L = ic * mCommon->kernelY() * mCommon->kernelX(); + auto tileC = std::max(unit, hP); + int blockSize = L; + int blockNum = 1; + float halfStride = 1; + size_t weightStride = 0; #ifdef MNN_LOW_MEMORY if (mResource && mResource->mDequantize.bits <= 8) { DenseConvolutionTiledExecutor::selectLowMemoryMatmulFunc(&matmulUnit, &matmulRemain, &weightBytes, mResource->mDequantize.bits, core); + int scaleSize = mResource->mDequantize.mScaleBias->size() / (2 * bytes); + blockNum = scaleSize / (mResource->hU * mResource->hP); + blockSize /= blockNum; dequantAlpha = mResource->mDequantize.mScaleBias->host(); - dequantBias = dequantAlpha + mResource->hU * mResource->hP * 
bytes; + dequantBias = dequantAlpha + scaleSize * bytes; + weightStride = (L - blockSize) * hP; + if (mResource->mDequantize.bits == 4) { + halfStride = 0.5; + weightStride = static_cast(weightStride * halfStride); + } } #endif auto kernel_width = mCommon->kernelX(); @@ -447,14 +491,12 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs auto output = outputs[0]; auto batch = output->batch(); int threadNumber = ((CPUBackend *)backend())->threadNumber(); - auto icC4 = UP_DIV(input->channel(), unit); - auto ic = input->channel(); - auto L = ic * mCommon->kernelY() * mCommon->kernelX(); + int LRoundup = ROUND_UP(L, lP); int LRoundupC4 = UP_DIV(LRoundup, unit); auto outputChannel = output->channel(); - auto tileC = std::max(unit, hP); auto oC4 = UP_DIV(outputChannel, tileC); + auto ocUp4 = ROUND_UP(outputChannel, hP); auto kernelSize = mCommon->kernelX() * mCommon->kernelY(); ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParameters, mCommon, input, output, mPadX, mPadY, core, nullptr); @@ -507,11 +549,12 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs size_t shapeParameters[PARAMETERSIZE]; size_t* parameters = shapeParameters; parameters[0] = eP * bytes; - parameters[1] = L; + parameters[1] = blockSize; parameters[2] = outputChannel; parameters[3] = plane * unit * bytes; parameters[4] = 0; - parameters[5] = 0; + parameters[5] = weightStride; // Only used when block quant + parameters[6] = 0; #ifdef PROFILE_DETAIL std::vector durationMul(threadNumberFirst, 0); @@ -572,9 +615,24 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs auto _weightFloatPtr = reinterpret_cast(weightPtr + int((ocIndex / hP * LRoundup * hP) * weightBytes)); auto _biasFloatPtr = reinterpret_cast(reinterpret_cast(biasPtr) + ocIndex * bytes); paraParameters[2] = std::min(outputChannel - ocIndex, tileC); - auto k = reinterpret_cast(dequantAlpha + ocIndex * bytes); - auto b = reinterpret_cast(dequantBias + ocIndex * bytes); - matmulUnit(_dstFloatPtr, (float*)gemmBuffer, _weightFloatPtr, paraParameters, postParameters.data(), _biasFloatPtr, k, b); + auto k = reinterpret_cast(dequantAlpha + ocIndex * bytes); + auto b = reinterpret_cast(dequantBias + ocIndex * bytes); + const float* relufp32 = nullptr; + const float* exeBiasPtr = nullptr; + int finishedL = 0; + int wquantStride = 0; + auto _weightPtr = reinterpret_cast(_weightFloatPtr); + uint8_t* _APtr = reinterpret_cast(gemmBuffer); + for (int bk = 0; bk < blockNum; ++bk) { + paraParameters[6] = bk; + if (bk == blockNum - 1) { + relufp32 = postParameters.data(); + exeBiasPtr = _biasFloatPtr; + } + finishedL = blockSize * bk; + wquantStride = static_cast(blockSize * bk * hP * halfStride); + matmulUnit(_dstFloatPtr, (float*)(_APtr + eP * finishedL * bytes), (float*)(_weightPtr + wquantStride), paraParameters, relufp32, exeBiasPtr, (float*)(k + bk * ocUp4 * bytes), (float*)(b + bk * ocUp4 * bytes)); + } } } MNN_CONCURRENCY_END(); @@ -588,9 +646,24 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs auto _weightFloatPtr = reinterpret_cast(weightPtr + int((ocIndex / hP * LRoundup * hP) * weightBytes)); auto _biasFloatPtr = reinterpret_cast(reinterpret_cast(biasPtr) + ocIndex * bytes); paraParameters[2] = std::min(outputChannel - ocIndex, tileC); - auto k = reinterpret_cast(dequantAlpha + ocIndex * bytes); - auto b = reinterpret_cast(dequantBias + ocIndex * bytes); - matmulRemain(_dstFloatPtr, (float*)gemmBuffer, _weightFloatPtr, xC, paraParameters, postParameters.data(), 
_biasFloatPtr, k, b); + auto k = reinterpret_cast(dequantAlpha + ocIndex * bytes); + auto b = reinterpret_cast(dequantBias + ocIndex * bytes); + const float* relufp32 = nullptr; + const float* exeBiasPtr = nullptr; + int finishedL = 0; + int wquantStride = 0; + const int8_t* _weightPtr = reinterpret_cast(_weightFloatPtr); + uint8_t* _APtr = reinterpret_cast(gemmBuffer); + for (int bk = 0; bk < blockNum; ++bk) { + paraParameters[6] = bk; + if (bk == blockNum - 1) { + relufp32 = postParameters.data(); + exeBiasPtr = _biasFloatPtr; + } + finishedL = blockSize * bk; + wquantStride = static_cast(blockSize * bk * hP * halfStride); + matmulRemain(_dstFloatPtr, (float*)(_APtr + eP * finishedL * bytes), (float*)(_weightPtr + wquantStride), xC, paraParameters, relufp32, exeBiasPtr, (float*)(k + bk * ocUp4 * bytes), (float*)(b + bk * ocUp4 * bytes)); + } } } MNN_CONCURRENCY_END(); @@ -633,11 +706,12 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs info[3] = mIm2ColParameters.strideX; size_t parameters[PARAMETERSIZE]; parameters[0] = eP * bytes; - parameters[1] = L; + parameters[1] = blockSize; parameters[2] = outputChannel; parameters[3] = plane * unit * bytes; parameters[4] = 0; - parameters[5] = 0; + parameters[5] = weightStride; // Only used when block quant + parameters[6] = 0; #ifdef PROFILE_DETAIL std::vector durationMul(threadNumberFirst, 0); @@ -673,13 +747,38 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs packATime[tId] += timer[tId].durationInUs(); timer[tId].reset(); #endif - auto k = reinterpret_cast(dequantAlpha); - auto b = reinterpret_cast(dequantBias); + int finishedL = 0; + int wquantStride = 0; + int8_t* _weightPtr = reinterpret_cast(weightPtr); auto _dstFloatPtr = reinterpret_cast(dstOrigin + start * unit * bytes); + const float* relufp32 = nullptr; + const float* exeBiasPtr = nullptr; if (xC == eP) { - matmulUnit(_dstFloatPtr, (float*)gemmBuffer, (float*)weightPtr, parameters, postParameters.data(), biasPtr, k, b); + // matmulUnit(_dstFloatPtr, (float*)gemmBuffer, (float*)weightPtr, parameters, postParameters.data(), biasPtr, k, b); + for (int bk = 0; bk < blockNum; ++bk) { + parameters[6] = bk; + if (bk == blockNum - 1) { + relufp32 = postParameters.data(); + exeBiasPtr = biasPtr; + } + finishedL = blockSize * bk; + wquantStride = static_cast(blockSize * bk * hP * halfStride); + + matmulUnit(_dstFloatPtr, (float*)(gemmBuffer + bytes * eP * finishedL), (float*)(_weightPtr + wquantStride), parameters, relufp32, exeBiasPtr, (float*)(dequantAlpha + bk * ocUp4 * bytes), (float*)(dequantBias + bk * ocUp4 * bytes)); + } } else { - matmulRemain(_dstFloatPtr, (float*)gemmBuffer, (float*)weightPtr, xC, parameters, postParameters.data(), biasPtr, k, b); + for (int bk = 0; bk < blockNum; ++bk) { + parameters[6] = bk; + if (bk == blockNum - 1) { + relufp32 = postParameters.data(); + exeBiasPtr = biasPtr; + } + finishedL = blockSize * bk; + wquantStride = static_cast(blockSize * bk * hP * halfStride); + + matmulRemain(_dstFloatPtr, (float*)(gemmBuffer + eP * bytes * finishedL), (float*)(_weightPtr + wquantStride), xC, parameters, relufp32, exeBiasPtr, (float*)(dequantAlpha + bk * ocUp4 * bytes), (float*)(dequantBias + bk * ocUp4 * bytes )); + } + // matmulRemain(_dstFloatPtr, (float*)gemmBuffer, (float*)weightPtr, xC, parameters, postParameters.data(), biasPtr, k, b); } #ifdef PROFILE_DETAIL diff --git a/source/backend/cpu/compute/StrassenMatmulComputor.cpp b/source/backend/cpu/compute/StrassenMatmulComputor.cpp index 04407b60c..094efbd5a 
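The onResize changes above split the reduction dimension L into blockNum blocks of blockSize and call the packed matmul once per block: parameters[1] carries blockSize, parameters[5] the extra weight stride between blocks, and the new parameters[6] the block index, while bias and post-parameters are passed only on the last block so activation is applied exactly once (which is also why the AVX/SSE wrappers below skip the post-treatment when bias is nullptr). A minimal C++ sketch of that dispatch loop under assumed names; runBlockQuantGemm and PackedMatMulFunc are invented for illustration and are not MNN's actual entry points:

#include <cstddef>
#include <cstdint>

// Signature assumed to mirror the packed-matmul kernels touched in this patch.
typedef void (*PackedMatMulFunc)(float* C, const float* A, const uint8_t* B,
                                 size_t* parameters, const float* postParameters,
                                 const float* bias, const float* scale, const float* offset);

// Drive one output tile through all quant blocks.
// parameters[1] = blockSize, parameters[5] = extra weight stride, parameters[6] = block index.
void runBlockQuantGemm(float* C, const float* A, const uint8_t* B, size_t* parameters,
                       const float* postParameters, const float* biasData,
                       const float* scales, const float* offsets,
                       int blockNum, int blockSize, int eP, int hP, int ocUp4,
                       float weightBytes /* 1.0 for int8, 0.5 for int4 */,
                       PackedMatMulFunc matmul) {
    const float* relu = nullptr;     // post treatment only on the last block
    const float* bias = nullptr;
    for (int bk = 0; bk < blockNum; ++bk) {
        parameters[6] = (size_t)bk;  // 0: overwrite C, > 0: accumulate into C
        if (bk == blockNum - 1) {    // bias + activation are applied exactly once
            relu = postParameters;
            bias = biasData;
        }
        const int finishedL   = blockSize * bk;                      // depth already consumed
        const int weightShift = (int)(finishedL * hP * weightBytes); // bytes into packed weights
        matmul(C,
               A + eP * finishedL,      // advance the packed A tile by whole blocks
               B + weightShift,
               parameters, relu, bias,
               scales + bk * ocUp4,     // per-block dequant scale plane
               offsets + bk * ocUp4);   // per-block dequant bias plane
    }
}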
100644 --- a/source/backend/cpu/compute/StrassenMatmulComputor.cpp +++ b/source/backend/cpu/compute/StrassenMatmulComputor.cpp @@ -94,13 +94,14 @@ ErrorCode StrassenMatrixComputor::_generateTrivalMatMul(int e, int l, int h, con mFunctions.emplace_back( std::make_pair([cStride, l, h, xCount, AT, BT, CT, COT, tileBufferBasic, unitNumber, bExtraStride, numberThread, eReal, eP, active, matmulUnit, matmulRemain, dequantAlpha, dequantBias, this](int tId) { auto core = static_cast(backend())->functions(); - size_t parameters[6]; + size_t parameters[7]; parameters[0] = xCount * core->bytes; parameters[1] = l; parameters[2] = h; parameters[3] = cStride; parameters[4] = 0; parameters[5] = bExtraStride; + parameters[6] = 0; auto tileHost = tileBufferBasic.ptr() + eP * parameters[1] * tId * core->bytes; const float* postParametersPtr = nullptr; if (!active.empty()) { diff --git a/source/backend/cpu/x86_x64/avx/GemmAVX2.cpp b/source/backend/cpu/x86_x64/avx/GemmAVX2.cpp index 56eaf70fb..2214e1688 100644 --- a/source/backend/cpu/x86_x64/avx/GemmAVX2.cpp +++ b/source/backend/cpu/x86_x64/avx/GemmAVX2.cpp @@ -35,22 +35,30 @@ void _AVX_MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t void _AVX_MNNPackedMatMul_int4(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b) { _AVX_MNNPackedMatMul_Main_int4(C, A, B, parameter, k, b); - AVX2GemmPostTreat(C, MNN_UNIT_E, parameter, postParameters, bias); + if (nullptr != bias) { + AVX2GemmPostTreat(C, MNN_UNIT_E, parameter, postParameters, bias); + } } void _AVX_MNNPackedMatMulRemain_int4(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b) { _AVX_MNNPackednMatMulRemainCommon_int4(C, A, B, eSize, parameter, k, b); - AVX2GemmPostTreat(C, eSize, parameter, postParameters, bias); + if (nullptr != bias) { + AVX2GemmPostTreat(C, eSize, parameter, postParameters, bias); + } } void _AVX_MNNPackedMatMul_int8(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b) { _AVX_MNNPackedMatMul_Main_int8(C, A, B, parameter, k, b); - AVX2GemmPostTreat(C, MNN_UNIT_E, parameter, postParameters, bias); + if (nullptr != bias) { + AVX2GemmPostTreat(C, MNN_UNIT_E, parameter, postParameters, bias); + } } void _AVX_MNNPackedMatMulRemain_int8(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b) { _AVX_MNNPackednMatMulRemainCommon_int8(C, A, B, eSize, parameter, k, b); - AVX2GemmPostTreat(C, eSize, parameter, postParameters, bias); + if (nullptr != bias) { + AVX2GemmPostTreat(C, eSize, parameter, postParameters, bias); + } } static __m128i _load_int4_to_int8(const uint8_t* src) { uint8_t c = 0xf; diff --git a/source/backend/cpu/x86_x64/avx/GemmFunction.hpp b/source/backend/cpu/x86_x64/avx/GemmFunction.hpp index 08dfdfef9..bf299722c 100644 --- a/source/backend/cpu/x86_x64/avx/GemmFunction.hpp +++ b/source/backend/cpu/x86_x64/avx/GemmFunction.hpp @@ -20,6 +20,27 @@ STORE_4(dst + 8 * (3 + 4 * u + 8 * v), m3); \ } +#define FMLA_TRANSPOSE_SAVE(u, v, z0, z3, z6, z9) \ + { \ + auto tmp_m0 = LOAD4(dst + 8 * (0 + 4 * u + 8 * v)); \ + auto tmp_m1 = LOAD4(dst + 8 * (1 + 4 * u + 8 * v)); \ + auto tmp_m2 = LOAD4(dst + 8 * (2 + 4 * u + 8 * v)); \ + auto tmp_m3 = LOAD4(dst + 8 * (3 + 4 * u + 8 * 
v)); \ + auto m0 = _mm256_extractf128_ps(z0, u); \ + auto m1 = _mm256_extractf128_ps(z3, u); \ + auto m2 = _mm256_extractf128_ps(z6, u); \ + auto m3 = _mm256_extractf128_ps(z9, u); \ + _MM_TRANSPOSE4_PS(m0, m1, m2, m3); \ + m0 = _mm_add_ps(tmp_m0, m0); \ + m1 = _mm_add_ps(tmp_m1, m1); \ + m2 = _mm_add_ps(tmp_m2, m2); \ + m3 = _mm_add_ps(tmp_m3, m3); \ + STORE_4(dst + 8 * (0 + 4 * u + 8 * v), m0); \ + STORE_4(dst + 8 * (1 + 4 * u + 8 * v), m1); \ + STORE_4(dst + 8 * (2 + 4 * u + 8 * v), m2); \ + STORE_4(dst + 8 * (3 + 4 * u + 8 * v), m3); \ + } + namespace { static inline __m128i mm_loadu_si128(const void* addr) { return _mm_castps_si128(LOAD4((const float*)addr)); @@ -858,9 +879,10 @@ static void _AVX_MNNPackedMatMul_Main_int4(TYPE* C, const TYPE* A, const TYPE* f auto cStride = parameter[3] / sizeof(TYPE); float weightBytes = 0.5; // sizeof(int4_t) auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; auto hC4 = UP_DIV(h, 4); float ws_tmp[4]; + size_t blockId = parameter[6]; for (int y = 0; y < hC4; ++y) { auto weight = B + y * bStride / 2; auto dst = C + (y / 2) * cStride + 4 * (y % 2); @@ -911,12 +933,21 @@ static void _AVX_MNNPackedMatMul_Main_int4(TYPE* C, const TYPE* A, const TYPE* f z10 = MNNAVXFMA(s1, w3, z10); z11 = MNNAVXFMA(s2, w3, z11); } - TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(1, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); - TRANPOSE_SAVE(1, 1, z1, z4, z7, z10); - TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); - TRANPOSE_SAVE(1, 2, z2, z5, z8, z11); + if (blockId == 0) { + TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(1, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + TRANPOSE_SAVE(1, 1, z1, z4, z7, z10); + TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); + TRANPOSE_SAVE(1, 2, z2, z5, z8, z11); + } else { + FMLA_TRANSPOSE_SAVE(0, 0, z0, z3, z6, z9); + FMLA_TRANSPOSE_SAVE(1, 0, z0, z3, z6, z9); + FMLA_TRANSPOSE_SAVE(0, 1, z1, z4, z7, z10); + FMLA_TRANSPOSE_SAVE(1, 1, z1, z4, z7, z10); + FMLA_TRANSPOSE_SAVE(0, 2, z2, z5, z8, z11); + FMLA_TRANSPOSE_SAVE(1, 2, z2, z5, z8, z11); + } } } @@ -929,7 +960,8 @@ static void _AVX_MNNPackedMatMul_int4_20(TYPE* C, const TYPE* A, const uint8_t* auto cStride = parameter[3] / sizeof(TYPE); float weightBytes = 0.5; // sizeof(int4_t) auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); float ws_tmp[4]; for (int y = 0; y < hC4; ++y) { @@ -981,11 +1013,19 @@ static void _AVX_MNNPackedMatMul_int4_20(TYPE* C, const TYPE* A, const uint8_t* z10 = MNNAVXFMA(s1, w3, z10); z11 = MNNAVXFMA(s2, w3, z11); } - TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(1, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); - TRANPOSE_SAVE(1, 1, z1, z4, z7, z10); - TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); + if (0 == blockId) { + TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(1, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + TRANPOSE_SAVE(1, 1, z1, z4, z7, z10); + TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); + } else { + FMLA_TRANSPOSE_SAVE(0, 0, z0, z3, z6, z9); + FMLA_TRANSPOSE_SAVE(1, 0, z0, z3, z6, z9); + FMLA_TRANSPOSE_SAVE(0, 1, z1, z4, z7, z10); + FMLA_TRANSPOSE_SAVE(1, 1, z1, z4, z7, z10); + FMLA_TRANSPOSE_SAVE(0, 2, z2, z5, z8, z11); + } } } @@ -997,7 +1037,8 @@ static void _AVX_MNNPackedMatMul_int4_16(TYPE* C, const TYPE* A, const uint8_t* auto cStride = parameter[3] / sizeof(TYPE); 
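Every store path in these kernels follows the same rule: when blockId is 0 the tile overwrites C (the original TRANPOSE_SAVE and plain stores), and for any later block the kernel loads the partial sums already in C, adds, and stores back (the new FMLA_* macros and the added else branches). A minimal SSE rendering of that idea, with an invented helper name and assuming 16 writable floats at dst:

#include <cstddef>
#include <xmmintrin.h>  // SSE: __m128, _mm_loadu_ps, _mm_add_ps, _mm_storeu_ps

// Hypothetical helper: write a 4x4 float tile of partial results into C.
static inline void storeOrAccumulateTile(float* dst, const __m128 acc[4], size_t blockId) {
    for (int i = 0; i < 4; ++i) {
        __m128 v = acc[i];
        if (blockId != 0) {
            v = _mm_add_ps(_mm_loadu_ps(dst + 4 * i), v); // add partial sums from earlier blocks
        }
        _mm_storeu_ps(dst + 4 * i, v);                    // the first block simply overwrites
    }
}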
float weightBytes = 0.5; // sizeof(int4_t) auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); float ws_tmp[4]; for (int y = 0; y < hC4; ++y) { @@ -1039,10 +1080,17 @@ static void _AVX_MNNPackedMatMul_int4_16(TYPE* C, const TYPE* A, const uint8_t* z9 = MNNAVXFMA(s0, w3, z9); z10 = MNNAVXFMA(s1, w3, z10); } - TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(1, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); - TRANPOSE_SAVE(1, 1, z1, z4, z7, z10); + if (0 == blockId) { + TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(1, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + TRANPOSE_SAVE(1, 1, z1, z4, z7, z10); + } else { + FMLA_TRANSPOSE_SAVE(0, 0, z0, z3, z6, z9); + FMLA_TRANSPOSE_SAVE(1, 0, z0, z3, z6, z9); + FMLA_TRANSPOSE_SAVE(0, 1, z1, z4, z7, z10); + FMLA_TRANSPOSE_SAVE(1, 1, z1, z4, z7, z10); + } } } @@ -1054,7 +1102,8 @@ static void _AVX_MNNPackedMatMul_int4_5(TYPE* C, const TYPE* A, const uint8_t* B auto cStride = parameter[3] / sizeof(TYPE); float weightBytes = 0.5; auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); int lC4 = l / 4; int lR = lC4 * 4; @@ -1115,17 +1164,53 @@ static void _AVX_MNNPackedMatMul_int4_5(TYPE* C, const TYPE* A, const uint8_t* B weight2 += 2; weight3 += 2; } - STORE_8(dst0, sumAvx00); - STORE_8(dst0 + 8, sumAvx10); - STORE_8(dst0 + 16, sumAvx20); - STORE_8(dst0 + 24, sumAvx30); - STORE_8(dst0 + 32, sumAvx40); - - STORE_8(dst2, sumAvx01); - STORE_8(dst2 + 8, sumAvx11); - STORE_8(dst2 + 16, sumAvx21); - STORE_8(dst2 + 24, sumAvx31); - STORE_8(dst2 + 32, sumAvx41); + if (0 == blockId) { + STORE_8(dst0, sumAvx00); + STORE_8(dst0 + 8, sumAvx10); + STORE_8(dst0 + 16, sumAvx20); + STORE_8(dst0 + 24, sumAvx30); + STORE_8(dst0 + 32, sumAvx40); + + STORE_8(dst2, sumAvx01); + STORE_8(dst2 + 8, sumAvx11); + STORE_8(dst2 + 16, sumAvx21); + STORE_8(dst2 + 24, sumAvx31); + STORE_8(dst2 + 32, sumAvx41); + } else { + auto tmp0 = LOAD8(dst0); + auto tmp1 = LOAD8(dst0 + 8); + auto tmp2 = LOAD8(dst0 + 16); + auto tmp3 = LOAD8(dst0 + 24); + auto tmp4 = LOAD8(dst0 + 32); + auto tmp5 = LOAD8(dst2); + auto tmp6 = LOAD8(dst2 + 8); + auto tmp7 = LOAD8(dst2 + 16); + auto tmp8 = LOAD8(dst2 + 24); + auto tmp9 = LOAD8(dst2 + 32); + + sumAvx00 = _mm256_add_ps(sumAvx00, tmp0); + sumAvx10 = _mm256_add_ps(sumAvx10, tmp1); + sumAvx20 = _mm256_add_ps(sumAvx20, tmp2); + sumAvx30 = _mm256_add_ps(sumAvx30, tmp3); + sumAvx40 = _mm256_add_ps(sumAvx40, tmp4); + sumAvx01 = _mm256_add_ps(sumAvx01, tmp5); + sumAvx11 = _mm256_add_ps(sumAvx11, tmp6); + sumAvx21 = _mm256_add_ps(sumAvx21, tmp7); + sumAvx31 = _mm256_add_ps(sumAvx31, tmp8); + sumAvx41 = _mm256_add_ps(sumAvx41, tmp9); + + STORE_8(dst0, sumAvx00); + STORE_8(dst0 + 8, sumAvx10); + STORE_8(dst0 + 16, sumAvx20); + STORE_8(dst0 + 24, sumAvx30); + STORE_8(dst0 + 32, sumAvx40); + + STORE_8(dst2, sumAvx01); + STORE_8(dst2 + 8, sumAvx11); + STORE_8(dst2 + 16, sumAvx21); + STORE_8(dst2 + 24, sumAvx31); + STORE_8(dst2 + 32, sumAvx41); + } } for (int y = hR; y < hC4; ++y) { auto weight = B + y * bStride / 2; @@ -1157,11 +1242,31 @@ static void _AVX_MNNPackedMatMul_int4_5(TYPE* C, const TYPE* A, const uint8_t* B z3 = MNNSSEFMA(s3, w0, z3); z4 = MNNSSEFMA(s4, w0, z4); } - STORE_4(dst + 8 * 0, z0); - STORE_4(dst + 8 * 1, z1); - STORE_4(dst + 8 * 
2, z2); - STORE_4(dst + 8 * 3, z3); - STORE_4(dst + 8 * 4, z4); + if (0 == blockId) { + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z1); + STORE_4(dst + 8 * 2, z2); + STORE_4(dst + 8 * 3, z3); + STORE_4(dst + 8 * 4, z4); + } else { + auto tmp0 = LOAD4(dst + 8 * 0); + auto tmp1 = LOAD4(dst + 8 * 1); + auto tmp2 = LOAD4(dst + 8 * 2); + auto tmp3 = LOAD4(dst + 8 * 3); + auto tmp4 = LOAD4(dst + 8 * 4); + + z0 = _mm_add_ps(tmp0, z0); + z1 = _mm_add_ps(tmp1, z1); + z2 = _mm_add_ps(tmp2, z2); + z3 = _mm_add_ps(tmp3, z3); + z4 = _mm_add_ps(tmp4, z4); + + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z1); + STORE_4(dst + 8 * 2, z2); + STORE_4(dst + 8 * 3, z3); + STORE_4(dst + 8 * 4, z4); + } } } @@ -1174,7 +1279,8 @@ static void _AVX_MNNPackedMatMul_int4_4(TYPE* C, const TYPE* A, const uint8_t* B auto cStride = parameter[3] / sizeof(TYPE); float weightBytes = 0.5; // sizeof(int4_t) auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); int lC4 = l / 4; int lR = lC4 * 4; @@ -1229,15 +1335,47 @@ static void _AVX_MNNPackedMatMul_int4_4(TYPE* C, const TYPE* A, const uint8_t* B weight2 += 2; weight3 += 2; } - STORE_8(dst0, sumAvx00); - STORE_8(dst0 + 8, sumAvx10); - STORE_8(dst0 + 16, sumAvx20); - STORE_8(dst0 + 24, sumAvx30); - - STORE_8(dst2, sumAvx01); - STORE_8(dst2 + 8, sumAvx11); - STORE_8(dst2 + 16, sumAvx21); - STORE_8(dst2 + 24, sumAvx31); + if (0 == blockId) { + STORE_8(dst0, sumAvx00); + STORE_8(dst0 + 8, sumAvx10); + STORE_8(dst0 + 16, sumAvx20); + STORE_8(dst0 + 24, sumAvx30); + + STORE_8(dst2, sumAvx01); + STORE_8(dst2 + 8, sumAvx11); + STORE_8(dst2 + 16, sumAvx21); + STORE_8(dst2 + 24, sumAvx31); + } else { + auto tmp0 = LOAD8(dst0); + auto tmp1 = LOAD8(dst0 + 8); + auto tmp2 = LOAD8(dst0 + 16); + auto tmp3 = LOAD8(dst0 + 24); + + auto tmp5 = LOAD8(dst2); + auto tmp6 = LOAD8(dst2 + 8); + auto tmp7 = LOAD8(dst2 + 16); + auto tmp8 = LOAD8(dst2 + 24); + + sumAvx00 = _mm256_add_ps(sumAvx00, tmp0); + sumAvx10 = _mm256_add_ps(sumAvx10, tmp1); + sumAvx20 = _mm256_add_ps(sumAvx20, tmp2); + sumAvx30 = _mm256_add_ps(sumAvx30, tmp3); + + sumAvx01 = _mm256_add_ps(sumAvx01, tmp5); + sumAvx11 = _mm256_add_ps(sumAvx11, tmp6); + sumAvx21 = _mm256_add_ps(sumAvx21, tmp7); + sumAvx31 = _mm256_add_ps(sumAvx31, tmp8); + + STORE_8(dst0, sumAvx00); + STORE_8(dst0 + 8, sumAvx10); + STORE_8(dst0 + 16, sumAvx20); + STORE_8(dst0 + 24, sumAvx30); + + STORE_8(dst2, sumAvx01); + STORE_8(dst2 + 8, sumAvx11); + STORE_8(dst2 + 16, sumAvx21); + STORE_8(dst2 + 24, sumAvx31); + } } float ws_tmp[4]; for (int y = hR; y < hC4; ++y) { @@ -1271,10 +1409,28 @@ static void _AVX_MNNPackedMatMul_int4_4(TYPE* C, const TYPE* A, const uint8_t* B z9 = MNNSSEFMA(s0, w3, z9); } _MM_TRANSPOSE4_PS(z0, z3, z6, z9); - STORE_4(dst + 8 * 0, z0); - STORE_4(dst + 8 * 1, z3); - STORE_4(dst + 8 * 2, z6); - STORE_4(dst + 8 * 3, z9); + if (0 == blockId) { + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z3); + STORE_4(dst + 8 * 2, z6); + STORE_4(dst + 8 * 3, z9); + } else { + + auto tmp0 = LOAD4(dst + 8 * 0); + auto tmp1 = LOAD4(dst + 8 * 1); + auto tmp2 = LOAD4(dst + 8 * 2); + auto tmp3 = LOAD4(dst + 8 * 3); + + z0 = _mm_add_ps(tmp0, z0); + z3 = _mm_add_ps(tmp1, z3); + z6 = _mm_add_ps(tmp2, z6); + z9 = _mm_add_ps(tmp3, z9); + + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z3); + STORE_4(dst + 8 * 2, z6); + STORE_4(dst + 8 * 3, z9); + } } } template @@ -1285,7 +1441,8 @@ static void 
_AVX_MNNPackedMatMul_int4_3(TYPE* C, const TYPE* A, const uint8_t* B auto cStride = parameter[3] / sizeof(TYPE); float weightBytes = 0.5; // sizeof(int4_t) auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); int lC4 = l / 4; int lR = lC4 * 4; @@ -1333,21 +1490,78 @@ static void _AVX_MNNPackedMatMul_int4_3(TYPE* C, const TYPE* A, const uint8_t* B weight2 += 2; weight3 += 2; } - STORE_4(dst0 + 0, _mm256_extractf128_ps(sumAvx00, 0)); - STORE_4(dst0 + 8, _mm256_extractf128_ps(sumAvx10, 0)); - STORE_4(dst0 + 16, _mm256_extractf128_ps(sumAvx20, 0)); - - STORE_4(dst1 + 0, _mm256_extractf128_ps(sumAvx00, 1)); - STORE_4(dst1 + 8, _mm256_extractf128_ps(sumAvx10, 1)); - STORE_4(dst1 + 16, _mm256_extractf128_ps(sumAvx20, 1)); - - STORE_4(dst2 + 0, _mm256_extractf128_ps(sumAvx01, 0)); - STORE_4(dst2 + 8, _mm256_extractf128_ps(sumAvx11, 0)); - STORE_4(dst2 + 16, _mm256_extractf128_ps(sumAvx21, 0)); - - STORE_4(dst3 + 0, _mm256_extractf128_ps(sumAvx01, 1)); - STORE_4(dst3 + 8, _mm256_extractf128_ps(sumAvx11, 1)); - STORE_4(dst3 + 16, _mm256_extractf128_ps(sumAvx21, 1)); + if (0 == blockId) { + STORE_4(dst0 + 0, _mm256_extractf128_ps(sumAvx00, 0)); + STORE_4(dst0 + 8, _mm256_extractf128_ps(sumAvx10, 0)); + STORE_4(dst0 + 16, _mm256_extractf128_ps(sumAvx20, 0)); + + STORE_4(dst1 + 0, _mm256_extractf128_ps(sumAvx00, 1)); + STORE_4(dst1 + 8, _mm256_extractf128_ps(sumAvx10, 1)); + STORE_4(dst1 + 16, _mm256_extractf128_ps(sumAvx20, 1)); + + STORE_4(dst2 + 0, _mm256_extractf128_ps(sumAvx01, 0)); + STORE_4(dst2 + 8, _mm256_extractf128_ps(sumAvx11, 0)); + STORE_4(dst2 + 16, _mm256_extractf128_ps(sumAvx21, 0)); + + STORE_4(dst3 + 0, _mm256_extractf128_ps(sumAvx01, 1)); + STORE_4(dst3 + 8, _mm256_extractf128_ps(sumAvx11, 1)); + STORE_4(dst3 + 16, _mm256_extractf128_ps(sumAvx21, 1)); + } else { + auto tmp00 = LOAD4(dst0 + 0); + auto tmp01 = LOAD4(dst0 + 8); + auto tmp02 = LOAD4(dst0 + 16); + + auto tmp10 = LOAD4(dst1 + 0); + auto tmp11 = LOAD4(dst1 + 8); + auto tmp12 = LOAD4(dst1 + 16); + + auto tmp20 = LOAD4(dst2 + 0); + auto tmp21 = LOAD4(dst2 + 8); + auto tmp22 = LOAD4(dst2 + 16); + + auto tmp30 = LOAD4(dst3 + 0); + auto tmp31 = LOAD4(dst3 + 8); + auto tmp32 = LOAD4(dst3 + 16); + + auto sum_tmp00 = _mm256_extractf128_ps(sumAvx00, 0); + auto sum_tmp01 = _mm256_extractf128_ps(sumAvx10, 0); + auto sum_tmp02 = _mm256_extractf128_ps(sumAvx20, 0); + auto sum_tmp10 = _mm256_extractf128_ps(sumAvx00, 1); + auto sum_tmp11 = _mm256_extractf128_ps(sumAvx10, 1); + auto sum_tmp12 = _mm256_extractf128_ps(sumAvx20, 1); + auto sum_tmp20 = _mm256_extractf128_ps(sumAvx01, 0); + auto sum_tmp21 = _mm256_extractf128_ps(sumAvx11, 0); + auto sum_tmp22 = _mm256_extractf128_ps(sumAvx21, 0); + auto sum_tmp30 = _mm256_extractf128_ps(sumAvx01, 1); + auto sum_tmp31 = _mm256_extractf128_ps(sumAvx11, 1); + auto sum_tmp32 = _mm256_extractf128_ps(sumAvx21, 1); + + sum_tmp00 = _mm_add_ps(tmp00, sum_tmp00); + sum_tmp01 = _mm_add_ps(tmp01, sum_tmp01); + sum_tmp02 = _mm_add_ps(tmp02, sum_tmp02); + sum_tmp10 = _mm_add_ps(tmp10, sum_tmp10); + sum_tmp11 = _mm_add_ps(tmp11, sum_tmp11); + sum_tmp12 = _mm_add_ps(tmp12, sum_tmp12); + sum_tmp20 = _mm_add_ps(tmp20, sum_tmp20); + sum_tmp21 = _mm_add_ps(tmp21, sum_tmp21); + sum_tmp22 = _mm_add_ps(tmp22, sum_tmp22); + sum_tmp30 = _mm_add_ps(tmp30, sum_tmp30); + sum_tmp31 = _mm_add_ps(tmp31, sum_tmp31); + sum_tmp32 = _mm_add_ps(tmp32, sum_tmp32); + + STORE_4(dst0 + 0, 
sum_tmp00); + STORE_4(dst0 + 8, sum_tmp01); + STORE_4(dst0 + 16, sum_tmp02); + STORE_4(dst1 + 0, sum_tmp10); + STORE_4(dst1 + 8, sum_tmp11); + STORE_4(dst1 + 16, sum_tmp12); + STORE_4(dst2 + 0, sum_tmp20); + STORE_4(dst2 + 8, sum_tmp21); + STORE_4(dst2 + 16, sum_tmp22); + STORE_4(dst3 + 0, sum_tmp30); + STORE_4(dst3 + 8, sum_tmp31); + STORE_4(dst3 + 16, sum_tmp32); + } } for (int y = hR; y < hC4; ++y) { @@ -1372,9 +1586,23 @@ static void _AVX_MNNPackedMatMul_int4_3(TYPE* C, const TYPE* A, const uint8_t* B z1 = MNNSSEFMA(s1, w0, z1); z2 = MNNSSEFMA(s2, w0, z2); } - STORE_4(dst + 8 * 0, z0); - STORE_4(dst + 8 * 1, z1); - STORE_4(dst + 8 * 2, z2); + if (0 == blockId) { + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z1); + STORE_4(dst + 8 * 2, z2); + } else { + auto tmp0 = LOAD4(dst + 8 * 0); + auto tmp1 = LOAD4(dst + 8 * 1); + auto tmp2 = LOAD4(dst + 8 * 2); + + z0 = _mm_add_ps(tmp0, z0); + z1 = _mm_add_ps(tmp1, z1); + z2 = _mm_add_ps(tmp2, z2); + + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z1); + STORE_4(dst + 8 * 2, z2); + } } } @@ -1386,7 +1614,8 @@ static void _AVX_MNNPackedMatMul_int4_2(TYPE* C, const TYPE* A, const uint8_t* B auto cStride = parameter[3] / sizeof(TYPE); float weightBytes = 0.5; auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); int lC4 = l / 4; int lR = lC4 * 4; @@ -1426,17 +1655,55 @@ static void _AVX_MNNPackedMatMul_int4_2(TYPE* C, const TYPE* A, const uint8_t* B weight2 += 2; weight3 += 2; } - STORE_4(dst0 + 0, _mm256_extractf128_ps(sumAvx00, 0)); - STORE_4(dst0 + 8, _mm256_extractf128_ps(sumAvx10, 0)); - - STORE_4(dst1 + 0, _mm256_extractf128_ps(sumAvx00, 1)); - STORE_4(dst1 + 8, _mm256_extractf128_ps(sumAvx10, 1)); - - STORE_4(dst2 + 0, _mm256_extractf128_ps(sumAvx01, 0)); - STORE_4(dst2 + 8, _mm256_extractf128_ps(sumAvx11, 0)); - - STORE_4(dst3 + 0, _mm256_extractf128_ps(sumAvx01, 1)); - STORE_4(dst3 + 8, _mm256_extractf128_ps(sumAvx11, 1)); + if (0 == blockId) { + STORE_4(dst0 + 0, _mm256_extractf128_ps(sumAvx00, 0)); + STORE_4(dst0 + 8, _mm256_extractf128_ps(sumAvx10, 0)); + + STORE_4(dst1 + 0, _mm256_extractf128_ps(sumAvx00, 1)); + STORE_4(dst1 + 8, _mm256_extractf128_ps(sumAvx10, 1)); + + STORE_4(dst2 + 0, _mm256_extractf128_ps(sumAvx01, 0)); + STORE_4(dst2 + 8, _mm256_extractf128_ps(sumAvx11, 0)); + + STORE_4(dst3 + 0, _mm256_extractf128_ps(sumAvx01, 1)); + STORE_4(dst3 + 8, _mm256_extractf128_ps(sumAvx11, 1)); + } else { + auto tmp01 = LOAD4(dst0 + 0); + auto tmp02 = LOAD4(dst0 + 8); + auto tmp11 = LOAD4(dst1 + 0); + auto tmp12 = LOAD4(dst1 + 8); + auto tmp21 = LOAD4(dst2 + 0); + auto tmp22 = LOAD4(dst2 + 8); + auto tmp31 = LOAD4(dst3 + 0); + auto tmp32 = LOAD4(dst3 + 8); + + auto x_tmp01 = _mm256_extractf128_ps(sumAvx00, 0); + auto x_tmp02 = _mm256_extractf128_ps(sumAvx10, 0); + auto x_tmp11 = _mm256_extractf128_ps(sumAvx00, 1); + auto x_tmp12 = _mm256_extractf128_ps(sumAvx10, 1); + auto x_tmp21 = _mm256_extractf128_ps(sumAvx01, 0); + auto x_tmp22 = _mm256_extractf128_ps(sumAvx11, 0); + auto x_tmp31 = _mm256_extractf128_ps(sumAvx01, 1); + auto x_tmp32 = _mm256_extractf128_ps(sumAvx11, 1); + + x_tmp01 = _mm_add_ps(tmp01, x_tmp01); + x_tmp02 = _mm_add_ps(tmp02, x_tmp02); + x_tmp11 = _mm_add_ps(tmp11, x_tmp11); + x_tmp12 = _mm_add_ps(tmp12, x_tmp12); + x_tmp21 = _mm_add_ps(tmp21, x_tmp21); + x_tmp22 = _mm_add_ps(tmp22, x_tmp22); + x_tmp31 = _mm_add_ps(tmp31, x_tmp31); + x_tmp32 = _mm_add_ps(tmp32, x_tmp32); + 
+ STORE_4(dst0 + 0, x_tmp01); + STORE_4(dst0 + 8, x_tmp02); + STORE_4(dst1 + 0, x_tmp11); + STORE_4(dst1 + 8, x_tmp12); + STORE_4(dst2 + 0, x_tmp21); + STORE_4(dst2 + 8, x_tmp22); + STORE_4(dst3 + 0, x_tmp31); + STORE_4(dst3 + 8, x_tmp32); + } } for (int y = hR; y < hC4; ++y) { @@ -1457,8 +1724,17 @@ static void _AVX_MNNPackedMatMul_int4_2(TYPE* C, const TYPE* A, const uint8_t* B z0 = MNNSSEFMA(s0, w0, z0); z1 = MNNSSEFMA(s1, w0, z1); } - STORE_4(dst + 8 * 0, z0); - STORE_4(dst + 8 * 1, z1); + if (0 == blockId) { + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z1); + } else { + auto t0 = LOAD4(dst + 8 * 0); + auto t1 = LOAD4(dst + 8 * 1); + z0 = _mm_add_ps(z0, t0); + z1 = _mm_add_ps(z1, t1); + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z1); + } } } @@ -1471,11 +1747,12 @@ static void _AVX_MNNPackednMatMulRemainCommon_int4(TYPE* C, const TYPE* A, const auto cStride = parameter[3] / sizeof(TYPE); float weightBytes = 0.5; // sizeof(int4_t) auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; auto hC4 = UP_DIV(h, 4); auto es = eSize; auto oC = C; auto aStride = parameter[0] / sizeof(TYPE); + size_t blockId = parameter[6]; if (eSize >= 20) { _AVX_MNNPackedMatMul_int4_20(C, A, B, parameter, k, b); eSize -= 20; @@ -1597,10 +1874,25 @@ static void _AVX_MNNPackednMatMulRemainCommon_int4(TYPE* C, const TYPE* A, const sum3 = MNNSSEFMA(s, w3, sum3); srcUse += aStride; } - STORE_4(dst0, sum0); - STORE_4(dst1, sum1); - STORE_4(dst2, sum2); - STORE_4(dst3, sum3); + if (blockId == 0) { + STORE_4(dst0, sum0); + STORE_4(dst1, sum1); + STORE_4(dst2, sum2); + STORE_4(dst3, sum3); + } else { + auto tmp_0 = LOAD4(dst0); + auto tmp_1 = LOAD4(dst1); + auto tmp_2 = LOAD4(dst2); + auto tmp_3 = LOAD4(dst3); + sum0 = _mm_add_ps(tmp_0, sum0); + sum1 = _mm_add_ps(tmp_1, sum1); + sum2 = _mm_add_ps(tmp_2, sum2); + sum3 = _mm_add_ps(tmp_3, sum3); + STORE_4(dst0, sum0); + STORE_4(dst1, sum1); + STORE_4(dst2, sum2); + STORE_4(dst3, sum3); + } } for (int y = hR; y < hC4; ++y) { auto weight = B + y * bStride / 2; @@ -1636,7 +1928,13 @@ static void _AVX_MNNPackednMatMulRemainCommon_int4(TYPE* C, const TYPE* A, const sum = MNNSSEFMA(s, w, sum); srcUse += aStride; } - STORE_4(dst, sum); + if (blockId == 0) { + STORE_4(dst, sum); + } else { + auto tmp_0 = LOAD4(dst); + sum = _mm_add_ps(tmp_0, sum); + STORE_4(dst, sum); + } } } @@ -1684,9 +1982,10 @@ static void _AVX_MNNPackedMatMul_Main_int8(TYPE* C, const TYPE* A, const TYPE* f auto cStride = parameter[3] / sizeof(TYPE); int weightBytes = sizeof(int8_t); auto bExtraStride = parameter[5] / weightBytes; - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; auto hC4 = UP_DIV(h, 4); float ws_tmp[4]; + size_t blockId = parameter[6]; for (int y = 0; y < hC4; ++y) { auto weight = B + y * bStride; auto dst = C + (y / 2) * cStride + 4 * (y % 2); @@ -1737,12 +2036,21 @@ static void _AVX_MNNPackedMatMul_Main_int8(TYPE* C, const TYPE* A, const TYPE* f z10 = MNNAVXFMA(s1, w3, z10); z11 = MNNAVXFMA(s2, w3, z11); } - TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(1, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); - TRANPOSE_SAVE(1, 1, z1, z4, z7, z10); - TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); - TRANPOSE_SAVE(1, 2, z2, z5, z8, z11); + if (blockId == 0) { + TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(1, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + TRANPOSE_SAVE(1, 1, z1, z4, z7, z10); + TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); + 
TRANPOSE_SAVE(1, 2, z2, z5, z8, z11); + } else { + FMLA_TRANSPOSE_SAVE(0, 0, z0, z3, z6, z9); + FMLA_TRANSPOSE_SAVE(1, 0, z0, z3, z6, z9); + FMLA_TRANSPOSE_SAVE(0, 1, z1, z4, z7, z10); + FMLA_TRANSPOSE_SAVE(1, 1, z1, z4, z7, z10); + FMLA_TRANSPOSE_SAVE(0, 2, z2, z5, z8, z11); + FMLA_TRANSPOSE_SAVE(1, 2, z2, z5, z8, z11); + } } } @@ -1755,7 +2063,8 @@ static void _AVX_MNNPackedMatMul_int8_20(TYPE* C, const TYPE* A, const int8_t* B auto cStride = parameter[3] / sizeof(TYPE); int weightBytes = sizeof(int8_t); auto bExtraStride = parameter[5] / weightBytes; - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); float ws_tmp[4]; for (int y = 0; y < hC4; ++y) { @@ -1807,11 +2116,19 @@ static void _AVX_MNNPackedMatMul_int8_20(TYPE* C, const TYPE* A, const int8_t* B z10 = MNNAVXFMA(s1, w3, z10); z11 = MNNAVXFMA(s2, w3, z11); } - TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(1, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); - TRANPOSE_SAVE(1, 1, z1, z4, z7, z10); - TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); + if (0 == blockId) { + TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(1, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + TRANPOSE_SAVE(1, 1, z1, z4, z7, z10); + TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); + } else { + FMLA_TRANSPOSE_SAVE(0, 0, z0, z3, z6, z9); + FMLA_TRANSPOSE_SAVE(1, 0, z0, z3, z6, z9); + FMLA_TRANSPOSE_SAVE(0, 1, z1, z4, z7, z10); + FMLA_TRANSPOSE_SAVE(1, 1, z1, z4, z7, z10); + FMLA_TRANSPOSE_SAVE(0, 2, z2, z5, z8, z11); + } } } @@ -1822,7 +2139,8 @@ static void _AVX_MNNPackedMatMul_int8_16(TYPE* C, const TYPE* A, const int8_t* B auto l = parameter[1]; auto cStride = parameter[3] / sizeof(TYPE); auto bExtraStride = parameter[5] / sizeof(int8_t); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); float ws_tmp[4]; for (int y = 0; y < hC4; ++y) { @@ -1864,10 +2182,17 @@ static void _AVX_MNNPackedMatMul_int8_16(TYPE* C, const TYPE* A, const int8_t* B z9 = MNNAVXFMA(s0, w3, z9); z10 = MNNAVXFMA(s1, w3, z10); } - TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(1, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); - TRANPOSE_SAVE(1, 1, z1, z4, z7, z10); + if (0 == blockId) { + TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(1, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + TRANPOSE_SAVE(1, 1, z1, z4, z7, z10); + } else { + FMLA_TRANSPOSE_SAVE(0, 0, z0, z3, z6, z9); + FMLA_TRANSPOSE_SAVE(1, 0, z0, z3, z6, z9); + FMLA_TRANSPOSE_SAVE(0, 1, z1, z4, z7, z10); + FMLA_TRANSPOSE_SAVE(1, 1, z1, z4, z7, z10); + } } } @@ -1878,7 +2203,8 @@ static void _AVX_MNNPackedMatMul_int8_5(TYPE* C, const TYPE* A, const int8_t* B, auto l = parameter[1]; auto cStride = parameter[3] / sizeof(TYPE); auto bExtraStride = parameter[5] / sizeof(int8_t); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); int lC4 = l / 4; int lR = lC4 * 4; @@ -1939,17 +2265,53 @@ static void _AVX_MNNPackedMatMul_int8_5(TYPE* C, const TYPE* A, const int8_t* B, weight2 += 4; weight3 += 4; } - STORE_8(dst0, sumAvx00); - STORE_8(dst0 + 8, sumAvx10); - STORE_8(dst0 + 16, sumAvx20); - STORE_8(dst0 + 24, sumAvx30); - STORE_8(dst0 + 32, sumAvx40); - - STORE_8(dst2, sumAvx01); - STORE_8(dst2 + 8, sumAvx11); - STORE_8(dst2 + 16, sumAvx21); - STORE_8(dst2 + 24, sumAvx31); - STORE_8(dst2 + 32, sumAvx41); + if (0 == blockId) { + 
STORE_8(dst0, sumAvx00); + STORE_8(dst0 + 8, sumAvx10); + STORE_8(dst0 + 16, sumAvx20); + STORE_8(dst0 + 24, sumAvx30); + STORE_8(dst0 + 32, sumAvx40); + + STORE_8(dst2, sumAvx01); + STORE_8(dst2 + 8, sumAvx11); + STORE_8(dst2 + 16, sumAvx21); + STORE_8(dst2 + 24, sumAvx31); + STORE_8(dst2 + 32, sumAvx41); + } else { + auto tmp0 = LOAD8(dst0); + auto tmp1 = LOAD8(dst0 + 8); + auto tmp2 = LOAD8(dst0 + 16); + auto tmp3 = LOAD8(dst0 + 24); + auto tmp4 = LOAD8(dst0 + 32); + auto tmp5 = LOAD8(dst2); + auto tmp6 = LOAD8(dst2 + 8); + auto tmp7 = LOAD8(dst2 + 16); + auto tmp8 = LOAD8(dst2 + 24); + auto tmp9 = LOAD8(dst2 + 32); + + sumAvx00 = _mm256_add_ps(sumAvx00, tmp0); + sumAvx10 = _mm256_add_ps(sumAvx10, tmp1); + sumAvx20 = _mm256_add_ps(sumAvx20, tmp2); + sumAvx30 = _mm256_add_ps(sumAvx30, tmp3); + sumAvx40 = _mm256_add_ps(sumAvx40, tmp4); + sumAvx01 = _mm256_add_ps(sumAvx01, tmp5); + sumAvx11 = _mm256_add_ps(sumAvx11, tmp6); + sumAvx21 = _mm256_add_ps(sumAvx21, tmp7); + sumAvx31 = _mm256_add_ps(sumAvx31, tmp8); + sumAvx41 = _mm256_add_ps(sumAvx41, tmp9); + + STORE_8(dst0, sumAvx00); + STORE_8(dst0 + 8, sumAvx10); + STORE_8(dst0 + 16, sumAvx20); + STORE_8(dst0 + 24, sumAvx30); + STORE_8(dst0 + 32, sumAvx40); + + STORE_8(dst2, sumAvx01); + STORE_8(dst2 + 8, sumAvx11); + STORE_8(dst2 + 16, sumAvx21); + STORE_8(dst2 + 24, sumAvx31); + STORE_8(dst2 + 32, sumAvx41); + } } for (int y = hR; y < hC4; ++y) { auto weight = B + y * bStride; @@ -1981,11 +2343,31 @@ static void _AVX_MNNPackedMatMul_int8_5(TYPE* C, const TYPE* A, const int8_t* B, z3 = MNNSSEFMA(s3, w0, z3); z4 = MNNSSEFMA(s4, w0, z4); } - STORE_4(dst + 8 * 0, z0); - STORE_4(dst + 8 * 1, z1); - STORE_4(dst + 8 * 2, z2); - STORE_4(dst + 8 * 3, z3); - STORE_4(dst + 8 * 4, z4); + if (0 == blockId) { + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z1); + STORE_4(dst + 8 * 2, z2); + STORE_4(dst + 8 * 3, z3); + STORE_4(dst + 8 * 4, z4); + } else { + auto tmp0 = LOAD4(dst + 8 * 0); + auto tmp1 = LOAD4(dst + 8 * 1); + auto tmp2 = LOAD4(dst + 8 * 2); + auto tmp3 = LOAD4(dst + 8 * 3); + auto tmp4 = LOAD4(dst + 8 * 4); + + z0 = _mm_add_ps(tmp0, z0); + z1 = _mm_add_ps(tmp1, z1); + z2 = _mm_add_ps(tmp2, z2); + z3 = _mm_add_ps(tmp3, z3); + z4 = _mm_add_ps(tmp4, z4); + + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z1); + STORE_4(dst + 8 * 2, z2); + STORE_4(dst + 8 * 3, z3); + STORE_4(dst + 8 * 4, z4); + } } } @@ -1997,7 +2379,8 @@ static void _AVX_MNNPackedMatMul_int8_4(TYPE* C, const TYPE* A, const int8_t* B, auto l = parameter[1]; auto cStride = parameter[3] / sizeof(TYPE); auto bExtraStride = parameter[5] / sizeof(int8_t); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); int lC4 = l / 4; int lR = lC4 * 4; @@ -2052,15 +2435,47 @@ static void _AVX_MNNPackedMatMul_int8_4(TYPE* C, const TYPE* A, const int8_t* B, weight2 += 4; weight3 += 4; } - STORE_8(dst0, sumAvx00); - STORE_8(dst0 + 8, sumAvx10); - STORE_8(dst0 + 16, sumAvx20); - STORE_8(dst0 + 24, sumAvx30); - - STORE_8(dst2, sumAvx01); - STORE_8(dst2 + 8, sumAvx11); - STORE_8(dst2 + 16, sumAvx21); - STORE_8(dst2 + 24, sumAvx31); + if (0 == blockId) { + STORE_8(dst0, sumAvx00); + STORE_8(dst0 + 8, sumAvx10); + STORE_8(dst0 + 16, sumAvx20); + STORE_8(dst0 + 24, sumAvx30); + + STORE_8(dst2, sumAvx01); + STORE_8(dst2 + 8, sumAvx11); + STORE_8(dst2 + 16, sumAvx21); + STORE_8(dst2 + 24, sumAvx31); + } else { + auto tmp0 = LOAD8(dst0); + auto tmp1 = LOAD8(dst0 + 8); + auto tmp2 = LOAD8(dst0 + 16); + auto tmp3 = 
LOAD8(dst0 + 24); + + auto tmp5 = LOAD8(dst2); + auto tmp6 = LOAD8(dst2 + 8); + auto tmp7 = LOAD8(dst2 + 16); + auto tmp8 = LOAD8(dst2 + 24); + + sumAvx00 = _mm256_add_ps(sumAvx00, tmp0); + sumAvx10 = _mm256_add_ps(sumAvx10, tmp1); + sumAvx20 = _mm256_add_ps(sumAvx20, tmp2); + sumAvx30 = _mm256_add_ps(sumAvx30, tmp3); + + sumAvx01 = _mm256_add_ps(sumAvx01, tmp5); + sumAvx11 = _mm256_add_ps(sumAvx11, tmp6); + sumAvx21 = _mm256_add_ps(sumAvx21, tmp7); + sumAvx31 = _mm256_add_ps(sumAvx31, tmp8); + + STORE_8(dst0, sumAvx00); + STORE_8(dst0 + 8, sumAvx10); + STORE_8(dst0 + 16, sumAvx20); + STORE_8(dst0 + 24, sumAvx30); + + STORE_8(dst2, sumAvx01); + STORE_8(dst2 + 8, sumAvx11); + STORE_8(dst2 + 16, sumAvx21); + STORE_8(dst2 + 24, sumAvx31); + } } float ws_tmp[4]; for (int y = hR; y < hC4; ++y) { @@ -2094,10 +2509,28 @@ static void _AVX_MNNPackedMatMul_int8_4(TYPE* C, const TYPE* A, const int8_t* B, z9 = MNNSSEFMA(s0, w3, z9); } _MM_TRANSPOSE4_PS(z0, z3, z6, z9); - STORE_4(dst + 8 * 0, z0); - STORE_4(dst + 8 * 1, z3); - STORE_4(dst + 8 * 2, z6); - STORE_4(dst + 8 * 3, z9); + if (0 == blockId) { + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z3); + STORE_4(dst + 8 * 2, z6); + STORE_4(dst + 8 * 3, z9); + } else { + + auto tmp0 = LOAD4(dst + 8 * 0); + auto tmp1 = LOAD4(dst + 8 * 1); + auto tmp2 = LOAD4(dst + 8 * 2); + auto tmp3 = LOAD4(dst + 8 * 3); + + z0 = _mm_add_ps(tmp0, z0); + z3 = _mm_add_ps(tmp1, z3); + z6 = _mm_add_ps(tmp2, z6); + z9 = _mm_add_ps(tmp3, z9); + + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z3); + STORE_4(dst + 8 * 2, z6); + STORE_4(dst + 8 * 3, z9); + } } } template @@ -2107,7 +2540,8 @@ static void _AVX_MNNPackedMatMul_int8_3(TYPE* C, const TYPE* A, const int8_t* B, auto l = parameter[1]; auto cStride = parameter[3] / sizeof(TYPE); auto bExtraStride = parameter[5] / sizeof(int8_t); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); int lC4 = l / 4; int lR = lC4 * 4; @@ -2155,21 +2589,78 @@ static void _AVX_MNNPackedMatMul_int8_3(TYPE* C, const TYPE* A, const int8_t* B, weight2 += 4; weight3 += 4; } - STORE_4(dst0 + 0, _mm256_extractf128_ps(sumAvx00, 0)); - STORE_4(dst0 + 8, _mm256_extractf128_ps(sumAvx10, 0)); - STORE_4(dst0 + 16, _mm256_extractf128_ps(sumAvx20, 0)); - - STORE_4(dst1 + 0, _mm256_extractf128_ps(sumAvx00, 1)); - STORE_4(dst1 + 8, _mm256_extractf128_ps(sumAvx10, 1)); - STORE_4(dst1 + 16, _mm256_extractf128_ps(sumAvx20, 1)); - - STORE_4(dst2 + 0, _mm256_extractf128_ps(sumAvx01, 0)); - STORE_4(dst2 + 8, _mm256_extractf128_ps(sumAvx11, 0)); - STORE_4(dst2 + 16, _mm256_extractf128_ps(sumAvx21, 0)); - - STORE_4(dst3 + 0, _mm256_extractf128_ps(sumAvx01, 1)); - STORE_4(dst3 + 8, _mm256_extractf128_ps(sumAvx11, 1)); - STORE_4(dst3 + 16, _mm256_extractf128_ps(sumAvx21, 1)); + if (0 == blockId) { + STORE_4(dst0 + 0, _mm256_extractf128_ps(sumAvx00, 0)); + STORE_4(dst0 + 8, _mm256_extractf128_ps(sumAvx10, 0)); + STORE_4(dst0 + 16, _mm256_extractf128_ps(sumAvx20, 0)); + + STORE_4(dst1 + 0, _mm256_extractf128_ps(sumAvx00, 1)); + STORE_4(dst1 + 8, _mm256_extractf128_ps(sumAvx10, 1)); + STORE_4(dst1 + 16, _mm256_extractf128_ps(sumAvx20, 1)); + + STORE_4(dst2 + 0, _mm256_extractf128_ps(sumAvx01, 0)); + STORE_4(dst2 + 8, _mm256_extractf128_ps(sumAvx11, 0)); + STORE_4(dst2 + 16, _mm256_extractf128_ps(sumAvx21, 0)); + + STORE_4(dst3 + 0, _mm256_extractf128_ps(sumAvx01, 1)); + STORE_4(dst3 + 8, _mm256_extractf128_ps(sumAvx11, 1)); + STORE_4(dst3 + 16, _mm256_extractf128_ps(sumAvx21, 1)); + } 
else { + auto tmp00 = LOAD4(dst0 + 0); + auto tmp01 = LOAD4(dst0 + 8); + auto tmp02 = LOAD4(dst0 + 16); + + auto tmp10 = LOAD4(dst1 + 0); + auto tmp11 = LOAD4(dst1 + 8); + auto tmp12 = LOAD4(dst1 + 16); + + auto tmp20 = LOAD4(dst2 + 0); + auto tmp21 = LOAD4(dst2 + 8); + auto tmp22 = LOAD4(dst2 + 16); + + auto tmp30 = LOAD4(dst3 + 0); + auto tmp31 = LOAD4(dst3 + 8); + auto tmp32 = LOAD4(dst3 + 16); + + auto sum_tmp00 = _mm256_extractf128_ps(sumAvx00, 0); + auto sum_tmp01 = _mm256_extractf128_ps(sumAvx10, 0); + auto sum_tmp02 = _mm256_extractf128_ps(sumAvx20, 0); + auto sum_tmp10 = _mm256_extractf128_ps(sumAvx00, 1); + auto sum_tmp11 = _mm256_extractf128_ps(sumAvx10, 1); + auto sum_tmp12 = _mm256_extractf128_ps(sumAvx20, 1); + auto sum_tmp20 = _mm256_extractf128_ps(sumAvx01, 0); + auto sum_tmp21 = _mm256_extractf128_ps(sumAvx11, 0); + auto sum_tmp22 = _mm256_extractf128_ps(sumAvx21, 0); + auto sum_tmp30 = _mm256_extractf128_ps(sumAvx01, 1); + auto sum_tmp31 = _mm256_extractf128_ps(sumAvx11, 1); + auto sum_tmp32 = _mm256_extractf128_ps(sumAvx21, 1); + + sum_tmp00 = _mm_add_ps(tmp00, sum_tmp00); + sum_tmp01 = _mm_add_ps(tmp01, sum_tmp01); + sum_tmp02 = _mm_add_ps(tmp02, sum_tmp02); + sum_tmp10 = _mm_add_ps(tmp10, sum_tmp10); + sum_tmp11 = _mm_add_ps(tmp11, sum_tmp11); + sum_tmp12 = _mm_add_ps(tmp12, sum_tmp12); + sum_tmp20 = _mm_add_ps(tmp20, sum_tmp20); + sum_tmp21 = _mm_add_ps(tmp21, sum_tmp21); + sum_tmp22 = _mm_add_ps(tmp22, sum_tmp22); + sum_tmp30 = _mm_add_ps(tmp30, sum_tmp30); + sum_tmp31 = _mm_add_ps(tmp31, sum_tmp31); + sum_tmp32 = _mm_add_ps(tmp32, sum_tmp32); + + STORE_4(dst0 + 0, sum_tmp00); + STORE_4(dst0 + 8, sum_tmp01); + STORE_4(dst0 + 16, sum_tmp02); + STORE_4(dst1 + 0, sum_tmp10); + STORE_4(dst1 + 8, sum_tmp11); + STORE_4(dst1 + 16, sum_tmp12); + STORE_4(dst2 + 0, sum_tmp20); + STORE_4(dst2 + 8, sum_tmp21); + STORE_4(dst2 + 16, sum_tmp22); + STORE_4(dst3 + 0, sum_tmp30); + STORE_4(dst3 + 8, sum_tmp31); + STORE_4(dst3 + 16, sum_tmp32); + } } for (int y = hR; y < hC4; ++y) { @@ -2194,9 +2685,23 @@ static void _AVX_MNNPackedMatMul_int8_3(TYPE* C, const TYPE* A, const int8_t* B, z1 = MNNSSEFMA(s1, w0, z1); z2 = MNNSSEFMA(s2, w0, z2); } - STORE_4(dst + 8 * 0, z0); - STORE_4(dst + 8 * 1, z1); - STORE_4(dst + 8 * 2, z2); + if (0 == blockId) { + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z1); + STORE_4(dst + 8 * 2, z2); + } else { + auto tmp0 = LOAD4(dst + 8 * 0); + auto tmp1 = LOAD4(dst + 8 * 1); + auto tmp2 = LOAD4(dst + 8 * 2); + + z0 = _mm_add_ps(tmp0, z0); + z1 = _mm_add_ps(tmp1, z1); + z2 = _mm_add_ps(tmp2, z2); + + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z1); + STORE_4(dst + 8 * 2, z2); + } } } @@ -2207,7 +2712,8 @@ static void _AVX_MNNPackedMatMul_int8_2(TYPE* C, const TYPE* A, const int8_t* B, auto l = parameter[1]; auto cStride = parameter[3] / sizeof(TYPE); auto bExtraStride = parameter[5] / sizeof(int8_t); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); int lC4 = l / 4; int lR = lC4 * 4; @@ -2247,17 +2753,55 @@ static void _AVX_MNNPackedMatMul_int8_2(TYPE* C, const TYPE* A, const int8_t* B, weight2 += 4; weight3 += 4; } - STORE_4(dst0 + 0, _mm256_extractf128_ps(sumAvx00, 0)); - STORE_4(dst0 + 8, _mm256_extractf128_ps(sumAvx10, 0)); - - STORE_4(dst1 + 0, _mm256_extractf128_ps(sumAvx00, 1)); - STORE_4(dst1 + 8, _mm256_extractf128_ps(sumAvx10, 1)); - - STORE_4(dst2 + 0, _mm256_extractf128_ps(sumAvx01, 0)); - STORE_4(dst2 + 8, _mm256_extractf128_ps(sumAvx11, 0)); - - 
STORE_4(dst3 + 0, _mm256_extractf128_ps(sumAvx01, 1)); - STORE_4(dst3 + 8, _mm256_extractf128_ps(sumAvx11, 1)); + if (0 == blockId) { + STORE_4(dst0 + 0, _mm256_extractf128_ps(sumAvx00, 0)); + STORE_4(dst0 + 8, _mm256_extractf128_ps(sumAvx10, 0)); + + STORE_4(dst1 + 0, _mm256_extractf128_ps(sumAvx00, 1)); + STORE_4(dst1 + 8, _mm256_extractf128_ps(sumAvx10, 1)); + + STORE_4(dst2 + 0, _mm256_extractf128_ps(sumAvx01, 0)); + STORE_4(dst2 + 8, _mm256_extractf128_ps(sumAvx11, 0)); + + STORE_4(dst3 + 0, _mm256_extractf128_ps(sumAvx01, 1)); + STORE_4(dst3 + 8, _mm256_extractf128_ps(sumAvx11, 1)); + } else { + auto tmp01 = LOAD4(dst0 + 0); + auto tmp02 = LOAD4(dst0 + 8); + auto tmp11 = LOAD4(dst1 + 0); + auto tmp12 = LOAD4(dst1 + 8); + auto tmp21 = LOAD4(dst2 + 0); + auto tmp22 = LOAD4(dst2 + 8); + auto tmp31 = LOAD4(dst3 + 0); + auto tmp32 = LOAD4(dst3 + 8); + + auto x_tmp01 = _mm256_extractf128_ps(sumAvx00, 0); + auto x_tmp02 = _mm256_extractf128_ps(sumAvx10, 0); + auto x_tmp11 = _mm256_extractf128_ps(sumAvx00, 1); + auto x_tmp12 = _mm256_extractf128_ps(sumAvx10, 1); + auto x_tmp21 = _mm256_extractf128_ps(sumAvx01, 0); + auto x_tmp22 = _mm256_extractf128_ps(sumAvx11, 0); + auto x_tmp31 = _mm256_extractf128_ps(sumAvx01, 1); + auto x_tmp32 = _mm256_extractf128_ps(sumAvx11, 1); + + x_tmp01 = _mm_add_ps(tmp01, x_tmp01); + x_tmp02 = _mm_add_ps(tmp02, x_tmp02); + x_tmp11 = _mm_add_ps(tmp11, x_tmp11); + x_tmp12 = _mm_add_ps(tmp12, x_tmp12); + x_tmp21 = _mm_add_ps(tmp21, x_tmp21); + x_tmp22 = _mm_add_ps(tmp22, x_tmp22); + x_tmp31 = _mm_add_ps(tmp31, x_tmp31); + x_tmp32 = _mm_add_ps(tmp32, x_tmp32); + + STORE_4(dst0 + 0, x_tmp01); + STORE_4(dst0 + 8, x_tmp02); + STORE_4(dst1 + 0, x_tmp11); + STORE_4(dst1 + 8, x_tmp12); + STORE_4(dst2 + 0, x_tmp21); + STORE_4(dst2 + 8, x_tmp22); + STORE_4(dst3 + 0, x_tmp31); + STORE_4(dst3 + 8, x_tmp32); + } } for (int y = hR; y < hC4; ++y) { @@ -2278,8 +2822,17 @@ static void _AVX_MNNPackedMatMul_int8_2(TYPE* C, const TYPE* A, const int8_t* B, z0 = MNNSSEFMA(s0, w0, z0); z1 = MNNSSEFMA(s1, w0, z1); } - STORE_4(dst + 8 * 0, z0); - STORE_4(dst + 8 * 1, z1); + if (0 == blockId) { + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z1); + } else { + auto t0 = LOAD4(dst + 8 * 0); + auto t1 = LOAD4(dst + 8 * 1); + z0 = _mm_add_ps(z0, t0); + z1 = _mm_add_ps(z1, t1); + STORE_4(dst + 8 * 0, z0); + STORE_4(dst + 8 * 1, z1); + } } } @@ -2291,7 +2844,8 @@ static void _AVX_MNNPackednMatMulRemainCommon_int8(TYPE* C, const TYPE* A, const auto l = parameter[1]; auto cStride = parameter[3] / sizeof(TYPE); auto bExtraStride = parameter[5] / sizeof(int8_t); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); auto es = eSize; auto oC = C; @@ -2417,10 +2971,25 @@ static void _AVX_MNNPackednMatMulRemainCommon_int8(TYPE* C, const TYPE* A, const sum3 = MNNSSEFMA(s, w3, sum3); srcUse += aStride; } - STORE_4(dst0, sum0); - STORE_4(dst1, sum1); - STORE_4(dst2, sum2); - STORE_4(dst3, sum3); + if (blockId == 0) { + STORE_4(dst0, sum0); + STORE_4(dst1, sum1); + STORE_4(dst2, sum2); + STORE_4(dst3, sum3); + } else { + auto tmp_0 = LOAD4(dst0); + auto tmp_1 = LOAD4(dst1); + auto tmp_2 = LOAD4(dst2); + auto tmp_3 = LOAD4(dst3); + sum0 = _mm_add_ps(tmp_0, sum0); + sum1 = _mm_add_ps(tmp_1, sum1); + sum2 = _mm_add_ps(tmp_2, sum2); + sum3 = _mm_add_ps(tmp_3, sum3); + STORE_4(dst0, sum0); + STORE_4(dst1, sum1); + STORE_4(dst2, sum2); + STORE_4(dst3, sum3); + } } for (int y = hR; y < hC4; ++y) { auto weight = B + y * bStride; @@ 
-2456,7 +3025,14 @@ static void _AVX_MNNPackednMatMulRemainCommon_int8(TYPE* C, const TYPE* A, const sum = MNNSSEFMA(s, w, sum); srcUse += aStride; } - STORE_4(dst, sum); + if (blockId == 0) { + STORE_4(dst, sum); + } else { + auto tmp_0 = LOAD4(dst); + sum = _mm_add_ps(tmp_0, sum); + STORE_4(dst, sum); + + } } } diff --git a/source/backend/cpu/x86_x64/sse/GemmCommon.hpp b/source/backend/cpu/x86_x64/sse/GemmCommon.hpp index 007a55c7c..8d555e95c 100644 --- a/source/backend/cpu/x86_x64/sse/GemmCommon.hpp +++ b/source/backend/cpu/x86_x64/sse/GemmCommon.hpp @@ -24,6 +24,27 @@ _mm_storeu_ps(dst + 4 * (3 + 4 * v), m3); \ } +#define FMLA_TRANPOSE_SAVE(u, v, z0, z3, z6, z9) \ + { \ + auto m0 = z0; \ + auto m1 = z3; \ + auto m2 = z6; \ + auto m3 = z9; \ + _MM_TRANSPOSE4_PS(m0, m1, m2, m3); \ + auto t0 = _mm_loadu_ps(dst + 4 * (0 + 4 * v));\ + auto t1 = _mm_loadu_ps(dst + 4 * (1 + 4 * v));\ + auto t2 = _mm_loadu_ps(dst + 4 * (2 + 4 * v));\ + auto t3 = _mm_loadu_ps(dst + 4 * (3 + 4 * v));\ + m0 = _mm_add_ps(m0, t0);\ + m1 = _mm_add_ps(m1, t1);\ + m2 = _mm_add_ps(m2, t2);\ + m3 = _mm_add_ps(m3, t3);\ + _mm_storeu_ps(dst + 4 * (0 + 4 * v), m0); \ + _mm_storeu_ps(dst + 4 * (1 + 4 * v), m1); \ + _mm_storeu_ps(dst + 4 * (2 + 4 * v), m2); \ + _mm_storeu_ps(dst + 4 * (3 + 4 * v), m3); \ + } + void _SSE_GemmPostTreat(float* C, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias); #endif diff --git a/source/backend/cpu/x86_x64/sse/GemmFunction.hpp b/source/backend/cpu/x86_x64/sse/GemmFunction.hpp index 57c2558b0..e0272c184 100644 --- a/source/backend/cpu/x86_x64/sse/GemmFunction.hpp +++ b/source/backend/cpu/x86_x64/sse/GemmFunction.hpp @@ -224,8 +224,9 @@ static void _SSE_MNNPackedMatMul_12_int4(float* C, const float* A, const float* auto cStride = parameter[3] / sizeof(float); float weightBytes = 0.5; // sizeof(int4_t) auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; auto hC4 = UP_DIV(h, 4); + auto blockId = parameter[6]; float ws_tmp[4]; for (int y = 0; y < hC4; ++y) { auto weight = B + y * bStride / 2; @@ -277,9 +278,15 @@ static void _SSE_MNNPackedMatMul_12_int4(float* C, const float* A, const float* z10 = MNNSSEFMA(s1, w3, z10); z11 = MNNSSEFMA(s2, w3, z11); } - TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); - TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); + if (0 == blockId) { + TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); + } else { + FMLA_TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + FMLA_TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + FMLA_TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); + } } } @@ -290,7 +297,8 @@ static void _SSE_MNNPackedMatMul_8_int4(float* C, const float* A, const uint8_t* auto cStride = parameter[3] / sizeof(float); float weightBytes = 0.5; // sizeof(int4_t) auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); float ws_tmp[4]; for (int y = 0; y < hC4; ++y) { @@ -333,8 +341,13 @@ static void _SSE_MNNPackedMatMul_8_int4(float* C, const float* A, const uint8_t* z7 = MNNSSEFMA(s1, w2, z7); z10 = MNNSSEFMA(s1, w3, z10); } - TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + if (0 == blockId) { + TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + } else { + FMLA_TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + 
FMLA_TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + } } } @@ -345,7 +358,8 @@ static void _SSE_MNNPackedMatMul_4_int4(float* C, const float* A, const uint8_t* auto cStride = parameter[3] / sizeof(float); float weightBytes = 0.5; auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); float ws_tmp[4]; for (int y = 0; y < hC4; ++y) { @@ -379,10 +393,26 @@ static void _SSE_MNNPackedMatMul_4_int4(float* C, const float* A, const uint8_t* z9 = MNNSSEFMA(s0, w3, z9); } _MM_TRANSPOSE4_PS(z0, z3, z6, z9); - _mm_storeu_ps(dst + 4 * 0, z0); - _mm_storeu_ps(dst + 4 * 1, z3); - _mm_storeu_ps(dst + 4 * 2, z6); - _mm_storeu_ps(dst + 4 * 3, z9); + if (0 == blockId) { + _mm_storeu_ps(dst + 4 * 0, z0); + _mm_storeu_ps(dst + 4 * 1, z3); + _mm_storeu_ps(dst + 4 * 2, z6); + _mm_storeu_ps(dst + 4 * 3, z9); + } else { + auto t0 = _mm_loadu_ps(dst + 4 * 0); + auto t1 = _mm_loadu_ps(dst + 4 * 1); + auto t2 = _mm_loadu_ps(dst + 4 * 2); + auto t3 = _mm_loadu_ps(dst + 4 * 3); + + z0 = _mm_add_ps(z0, t0); + z3 = _mm_add_ps(z3, t1); + z6 = _mm_add_ps(z6, t2); + z9 = _mm_add_ps(z9, t3); + _mm_storeu_ps(dst + 4 * 0, z0); + _mm_storeu_ps(dst + 4 * 1, z3); + _mm_storeu_ps(dst + 4 * 2, z6); + _mm_storeu_ps(dst + 4 * 3, z9); + } } } @@ -394,7 +424,8 @@ static void _SSE_MNNPackednMatMulRemainCommon_int4(float* C, const float* A, con auto cStride = parameter[3] / sizeof(float); float weightBytes = 0.5; // sizeof(int4_t) auto bExtraStride = static_cast(parameter[5] / weightBytes); // parameter[5]/weightBytes - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); auto es = eSize; auto oC = C; @@ -424,7 +455,15 @@ static void _SSE_MNNPackednMatMulRemainCommon_int4(float* C, const float* A, con auto w = _load_int4x4(weight + sy * 2, alpha, bias); sum = MNNSSEFMA(s, w, sum); } - _mm_storeu_ps(dst, sum); + if (0 == blockId) { + _mm_storeu_ps(dst, sum); + } else { + auto tmp = _mm_loadu_ps(dst); + sum = _mm_add_ps(sum, tmp); + _mm_storeu_ps(dst, sum); + } + + } } } @@ -446,7 +485,8 @@ static void _SSE_MNNPackedMatMul_12_int8(float* C, const float* A, const float* auto cStride = parameter[3] / sizeof(float); float weightBytes = 1; // sizeof(int8_t) auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); float ws_tmp[4]; for (int y = 0; y < hC4; ++y) { @@ -499,9 +539,16 @@ static void _SSE_MNNPackedMatMul_12_int8(float* C, const float* A, const float* z10 = MNNSSEFMA(s1, w3, z10); z11 = MNNSSEFMA(s2, w3, z11); } - TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); - TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); + if (0 == blockId) { + TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); + } else { + FMLA_TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + FMLA_TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + FMLA_TRANPOSE_SAVE(0, 2, z2, z5, z8, z11); + } + } } @@ -512,7 +559,8 @@ static void _SSE_MNNPackedMatMul_8_int8(float* C, const float* A, const int8_t* auto cStride = parameter[3] / sizeof(float); float weightBytes = 1; // sizeof(int8_t) auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); 
float ws_tmp[4]; for (int y = 0; y < hC4; ++y) { @@ -555,8 +603,14 @@ static void _SSE_MNNPackedMatMul_8_int8(float* C, const float* A, const int8_t* z7 = MNNSSEFMA(s1, w2, z7); z10 = MNNSSEFMA(s1, w3, z10); } - TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); - TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + if (0 == blockId) { + TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + } else { + FMLA_TRANPOSE_SAVE(0, 0, z0, z3, z6, z9); + FMLA_TRANPOSE_SAVE(0, 1, z1, z4, z7, z10); + } + } } @@ -567,7 +621,8 @@ static void _SSE_MNNPackedMatMul_4_int8(float* C, const float* A, const int8_t* auto cStride = parameter[3] / sizeof(float); float weightBytes = 1; // sizeof(int8_t) auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; + auto blockId = parameter[6]; auto hC4 = UP_DIV(h, 4); float ws_tmp[4]; for (int y = 0; y < hC4; ++y) { @@ -601,10 +656,27 @@ static void _SSE_MNNPackedMatMul_4_int8(float* C, const float* A, const int8_t* z9 = MNNSSEFMA(s0, w3, z9); } _MM_TRANSPOSE4_PS(z0, z3, z6, z9); - _mm_storeu_ps(dst + 4 * 0, z0); - _mm_storeu_ps(dst + 4 * 1, z3); - _mm_storeu_ps(dst + 4 * 2, z6); - _mm_storeu_ps(dst + 4 * 3, z9); + if (0 == blockId) { + _mm_storeu_ps(dst + 4 * 0, z0); + _mm_storeu_ps(dst + 4 * 1, z3); + _mm_storeu_ps(dst + 4 * 2, z6); + _mm_storeu_ps(dst + 4 * 3, z9); + } else { + auto t0 = _mm_loadu_ps(dst + 4 * 0); + auto t1 = _mm_loadu_ps(dst + 4 * 1); + auto t2 = _mm_loadu_ps(dst + 4 * 2); + auto t3 = _mm_loadu_ps(dst + 4 * 3); + + z0 = _mm_add_ps(t0, z0); + z3 = _mm_add_ps(t1, z3); + z6 = _mm_add_ps(t2, z6); + z9 = _mm_add_ps(t3, z9); + + _mm_storeu_ps(dst + 4 * 0, z0); + _mm_storeu_ps(dst + 4 * 1, z3); + _mm_storeu_ps(dst + 4 * 2, z6); + _mm_storeu_ps(dst + 4 * 3, z9); + } } } @@ -616,11 +688,12 @@ static void _SSE_MNNPackednMatMulRemainCommon_int8(float* C, const float* A, con auto cStride = parameter[3] / sizeof(float); float weightBytes = 1; // sizeof(int8_t) auto bExtraStride = static_cast(parameter[5] / weightBytes); - auto bStride = bExtraStride + l * 4; + auto bStride = bExtraStride + 4 * l; auto hC4 = UP_DIV(h, 4); auto es = eSize; auto oC = C; auto aStride = parameter[0] / sizeof(float); + auto blockId = parameter[6]; if (eSize >= 8) { _SSE_MNNPackedMatMul_8_int8(C, A, B, parameter, k, b); eSize -= 8; @@ -646,7 +719,13 @@ static void _SSE_MNNPackednMatMulRemainCommon_int8(float* C, const float* A, con auto w = _load_int8x4(weight + sy * 4, alpha, bias); sum = MNNSSEFMA(s, w, sum); } - _mm_storeu_ps(dst, sum); + if (blockId == 0) { + _mm_storeu_ps(dst, sum); + } else { + auto t = _mm_loadu_ps(dst); + sum = _mm_add_ps(sum, t); + _mm_storeu_ps(dst, sum); + } } } } diff --git a/source/backend/cpu/x86_x64/sse/GemmSSE.cpp b/source/backend/cpu/x86_x64/sse/GemmSSE.cpp index accd3f8dd..7d5699e96 100644 --- a/source/backend/cpu/x86_x64/sse/GemmSSE.cpp +++ b/source/backend/cpu/x86_x64/sse/GemmSSE.cpp @@ -35,13 +35,17 @@ void _SSE_MNNPackedMatMul_int4(float* C, const float* A, const float* B, const s auto hC4 = UP_DIV(h, 4); auto cStride = parameter[3] / sizeof(float); _SSE_MNNPackedMatMul_12_int4(C, A, B, parameter, k, b); - _SSE_GemmPostTreat(C, 12, parameter, postParameters, bias); + if (nullptr != bias) { + _SSE_GemmPostTreat(C, 12, parameter, postParameters, bias); + } } void _SSE_MNNPackedMatMulRemain_int4(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b) { 
_SSE_MNNPackednMatMulRemainCommon_int4(C, A, B, eSize, parameter, postParameters, bias, k, b); - _SSE_GemmPostTreat(C, eSize, parameter, postParameters, bias); + if (nullptr != bias) { + _SSE_GemmPostTreat(C, eSize, parameter, postParameters, bias); + } } void _SSE_MNNPackedMatMul_int8(float* C, const float* A, const float* B, const size_t* parameter, @@ -50,13 +54,17 @@ void _SSE_MNNPackedMatMul_int8(float* C, const float* A, const float* B, const s auto hC4 = UP_DIV(h, 4); auto cStride = parameter[3] / sizeof(float); _SSE_MNNPackedMatMul_12_int8(C, A, B, parameter, k, b); - _SSE_GemmPostTreat(C, 12, parameter, postParameters, bias); + if (nullptr != bias) { + _SSE_GemmPostTreat(C, 12, parameter, postParameters, bias); + } } void _SSE_MNNPackedMatMulRemain_int8(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b) { _SSE_MNNPackednMatMulRemainCommon_int8(C, A, B, eSize, parameter, postParameters, bias, k, b); - _SSE_GemmPostTreat(C, eSize, parameter, postParameters, bias); + if (nullptr != bias) { + _SSE_GemmPostTreat(C, eSize, parameter, postParameters, bias); + } } void _SSE_MNNGemmHybridInt4(float* C, const int8_t* A, const int8_t* B, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, size_t realSize, const float** param) { diff --git a/source/backend/metal/AllShader.cpp b/source/backend/metal/AllShader.cpp index 620e6ce99..2fb7cacc2 100644 --- a/source/backend/metal/AllShader.cpp +++ b/source/backend/metal/AllShader.cpp @@ -1659,6 +1659,7 @@ const char* shader_MetalConvolution1x1_metal = " int output_slice;\n" " int output_channel;\n" " int batch;\n" +" int block_size;\n" " conv_activation_type activation;\n" "};\n" "kernel void conv1x1_g1z4(const device M4 *in [[buffer(0)]],\n" @@ -1712,29 +1713,32 @@ const char* shader_MetalConvolution1x1_metal = " auto biasValue=FLOAT4(biasTerms[uz]);\n" " FLOAT4 result0=biasValue,result1=biasValue,result2=biasValue,result3=biasValue;\n" " int computeSize=min(cst.output_size-rx,CONV_UNROLL);\n" -" auto scale=FLOAT4(dequantScale[uz]);\n" -" auto dequant_bias=FLOAT4(dequantScale[uz+cst.output_slice]);\n" -" for (auto z=0; zfloat */\n" " FLOAT4x4 w_fp32=FLOAT4x4(FLOAT4(w[0]),FLOAT4(w[1]),FLOAT4(w[2]),FLOAT4(w[3]));\n" " FLOAT4x4 w_dequant;\n" " for (int i=0; i<4; ++i) {\n" " w_dequant[i]=w_fp32[i]*scale[i]+dequant_bias[i];\n" " }\n" -" \n" " result0 += FLOAT4(in40*w_dequant);\n" " result1 += FLOAT4(in41*w_dequant);\n" " result2 += FLOAT4(in42*w_dequant);\n" " result3 += FLOAT4(in43*w_dequant);\n" " xy_in0 += cst.input_size*cst.batch;\n" " }\n" -" \n" +" }\n" " /* true */ \n" " xy_out[0]=activate(M4(result0),cst.activation);\n" " if (computeSize>1) {xy_out[1]=activate(M4(result1),cst.activation); }\n" @@ -1757,9 +1761,13 @@ const char* shader_MetalConvolution1x1_metal = " auto biasValue=FLOAT4(biasTerms[uz]);\n" " FLOAT4 result0=biasValue,result1=biasValue,result2=biasValue,result3=biasValue;\n" " int computeSize=min(cst.output_size-rx,CONV_UNROLL);\n" -" auto scale=FLOAT4(dequantScale[uz]);\n" -" auto dequant_bias=FLOAT4(dequantScale[uz+cst.output_slice]);\n" -" for (auto z=0; z) diff --git a/source/backend/metal/MetalAttention.mm b/source/backend/metal/MetalAttention.mm index 5bdba5fff..2c6eed591 100644 --- a/source/backend/metal/MetalAttention.mm +++ b/source/backend/metal/MetalAttention.mm @@ -24,6 +24,7 @@ int query_seq_len; int key_seq_len; int head_num; + int group; int head_dim; float scale; }; @@ -45,18 +46,21 @@ kernel void 
main0(const device T* input0 [[buffer(0)]], if (x >= param.query_seq_len || y >= param.head_num || z >= param.key_seq_len) { return; } + int group = param.group; int query_seq_len = param.query_seq_len; int key_seq_len = param.key_seq_len; int head_num = param.head_num; int head_dim = param.head_dim; + int yr = y % param.group; const int offset = head_num * head_dim; const int offset_head = y * head_dim; + const int offset_head_kv = (y / param.group) * head_dim; const device T* A_offset = input0 + x * offset + offset_head; - device T* Pastkey_offset = past_key + z * offset + offset_head; + device T* Pastkey_offset = past_key + z * offset / group + offset_head_kv; float Vscale = (float)param.scale; #ifdef FOR_PREFILL - device const T* B_offset = input1 + z * offset + offset_head; + device const T* B_offset = input1 + z * offset / group + offset_head_kv; const int output_offset = y * query_seq_len * key_seq_len; float out0 = 0.0; @@ -64,7 +68,9 @@ kernel void main0(const device T* input0 [[buffer(0)]], float A = (float)(A_offset[i]); float B = (float)(B_offset[i]); out0 += B * A; - Pastkey_offset[i] = (T)B; + if (yr == 0) { + Pastkey_offset[i] = (T)B; + } } out0 *= Vscale; @@ -76,14 +82,16 @@ kernel void main0(const device T* input0 [[buffer(0)]], #endif output[output_offset + x * key_seq_len + z] = (T)out0; #else - const device T *B_offset = input1 + offset_head; + const device T *B_offset = input1 + offset_head_kv; float out = 0.0; if (z == key_seq_len - 1) { for(int i = 0; i < head_dim; ++i){ float A = (float)(A_offset[i]); float B = (float)(B_offset[i]); out += B * A; - Pastkey_offset[i] = (T)B; + if (yr == 0) { + Pastkey_offset[i] = (T)B; + } } } else { for(int i = 0; i < head_dim; ++i){ @@ -109,6 +117,7 @@ kernel void main0(const device T* input0 [[buffer(0)]], int query_seq_len; int key_seq_len; int head_num; + int group; int head_dim; float scale; }; @@ -124,12 +133,15 @@ kernel void main0(const device T* input0 [[buffer(0)]], if (x >= param.query_seq_len || y >= param.head_num || z >= param.head_dim) { return; } + int group = param.group; + int yin = y / param.group; + int yr = y % param.group; int qk_seq_len = param.query_seq_len; int value_seq_len = param.key_seq_len; int head_num = param.head_num; int head_dim = param.head_dim; - const int offset = head_num * head_dim; - const int offset_head = y * head_dim + z; + const int stride = head_num * head_dim / group; + const int offset_head = yin * head_dim + z; #ifdef FOR_PREFILL device const T *A_offset = input0 + (y * qk_seq_len + x) * value_seq_len; device const T *B_offset = input1 + offset_head; @@ -138,11 +150,13 @@ kernel void main0(const device T* input0 [[buffer(0)]], for(int i = 0; i < value_seq_len; ++i){ float A0 = (float)A_offset[i]; - float B = (float)B_offset[i*offset]; + float B = (float)B_offset[i*stride]; out += A0 * B; - Pastvalue_offset[i*offset] = B; + if (yr == 0) { + Pastvalue_offset[i*stride] = B; + } } - output[ x * offset + (y * head_dim + z)] = out; + output[ x * stride * group + (y * head_dim + z)] = out; #else device const T *A_offset = input0 + y; device const T *B_offset = input1 + offset_head; @@ -151,12 +165,14 @@ kernel void main0(const device T* input0 [[buffer(0)]], for(int i = 0; i < value_seq_len - 1; ++i){ float A = (float)A_offset[i * head_num]; - float B = (float)Pastvalue_offset[i * offset]; + float B = (float)Pastvalue_offset[i * stride]; out += A * B; } out += (float)A_offset[(value_seq_len - 1)*head_num] * (float)B_offset[0]; - Pastvalue_offset[(value_seq_len - 1)*offset] = B_offset[0]; + 
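    // Context for the grouped-query-attention (GQA) handling in this kernel: with
    // group = head_num / kv_head_num (passed in as param.group), query head y reads
    // the shared KV head y / group, yr = y % group is its index inside the group,
    // and only the first query head of each group (yr == 0) writes the shared past
    // key/value cache, so each KV head is stored exactly once. Illustrative mapping
    // only (kv_head_num is not a variable of this shader):
    //     int group   = head_num / kv_head_num;  // e.g. 32 query heads / 8 KV heads = 4
    //     int kv_head = y / group;                // KV head shared by query head y
    //     int yr      = y % group;                // position of y within its group
    //     if (yr == 0) { /* this thread updates the shared KV cache entry */ }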
if (yr == 0) { + Pastvalue_offset[(value_seq_len - 1)*stride] = B_offset[0]; + } output[(y * head_dim + z)] = (T)out; #endif @@ -194,7 +210,7 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override { const int mExpandChunk = 64; bool mIsDecode = false; std::shared_ptr mTempQK, mTempSoftMax; - int mNumHead = 0, mHeadDim = 0, mValueH = 0; + int mNumHead = 0, mHeadDim = 0, mValueH = 0, mKvNumHead = 0; id mKernel_softmax = nil; id mKernel_qk = nil; @@ -209,6 +225,7 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override { int query_seq_len; int key_seq_len; int head_num; + int group; int head_dim; float scale; }; @@ -247,13 +264,13 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override { } bool needCopy = mCache->mMaxLength > 0; - size_t old_size = mNumHead * mCache->mMaxLength * mHeadDim * byte; + size_t old_size = mKvNumHead * mCache->mMaxLength * mHeadDim * byte; mCache->mMaxLength = mCache->mPastLength + mExpandChunk; // past_key: [1, numhead, headdim, maxlen] - auto new_key = Tensor::createDevice({mCache->mMaxLength, mNumHead, mHeadDim}); + auto new_key = Tensor::createDevice({mCache->mMaxLength, mKvNumHead, mHeadDim}); // past_value: [1, numhead, maxlen, headdim] - auto new_value = Tensor::createDevice({mCache->mMaxLength, mNumHead, mHeadDim}); - size_t size = mNumHead * mCache->mMaxLength * mHeadDim * byte; + auto new_value = Tensor::createDevice({mCache->mMaxLength, mKvNumHead, mHeadDim}); + size_t size = mKvNumHead * mCache->mMaxLength * mHeadDim * byte; backend()->onAcquireBuffer(new_key, Backend::STATIC); backend()->onAcquireBuffer(new_value, Backend::STATIC); if (needCopy) { @@ -356,6 +373,10 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override { if(mIsDecode){ mCache->mKv_seq_len = mCache->mPastLength + 1; } + mKvNumHead = key->shape()[2]; + + int group_size = mNumHead / mKvNumHead; + reallocKVCache(); // Update Parameters @@ -365,6 +386,7 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override { param->head_dim = mHeadDim; param->key_seq_len = mCache->mKv_seq_len; param->head_num = mNumHead; + param->group = group_size; param->query_seq_len = seq_len; } // For softmax parameter diff --git a/source/backend/metal/MetalConvolution1x1.mm b/source/backend/metal/MetalConvolution1x1.mm index 9ca59c59c..cfd9f47b6 100644 --- a/source/backend/metal/MetalConvolution1x1.mm +++ b/source/backend/metal/MetalConvolution1x1.mm @@ -70,9 +70,12 @@ auto oc_4 = UP_DIV(output->channel(), 4); auto backend = static_cast(this->backend()); auto context = (__bridge MNNMetalContext *)backend->context(); - + int blockSize = 1; + if (mDequantScaleBias.get()) { + blockSize = (int)(mDequantScaleBias->usize() /sizeof(float) / oc_4 / 2 / 4); + } // create const buffer - int constants[] = {is, ic_4, ow, oh, os, oc_4, oc, ob, mActivationType}; + int constants[] = {is, ic_4, ow, oh, os, oc_4, oc, ob, blockSize, mActivationType}; mConstBuffer = backend->getConstBuffer(sizeof(constants)); ::memcpy(mConstBuffer.contents, constants, sizeof(constants)); diff --git a/source/backend/metal/MetalConvolutionCommon.mm b/source/backend/metal/MetalConvolutionCommon.mm index 5bdeea2b0..548aae2ef 100644 --- a/source/backend/metal/MetalConvolutionCommon.mm +++ b/source/backend/metal/MetalConvolutionCommon.mm @@ -97,35 +97,52 @@ void weightInBlock(int group, int oc, int ic, int kh, int kw, const FType *src, } } -static std::shared_ptr getDequantScale(float* scale, int size, MetalBackend *backend, bool asymmetric) { - int 
outputCount = 0; +static std::shared_ptr getDequantScale(const float* scale, int size, MetalBackend *backend, bool asymmetric, int oc) { + int totalCount = 0; if (asymmetric) { - outputCount = size / 2; + totalCount = size / 2; } else { - outputCount = size; + totalCount = size; } - int alignOutputCount = ALIGN_UP4(outputCount); - std::shared_ptr dequantScale(MNN::Tensor::createDevice({(int)(alignOutputCount * sizeof(float) * 2)})); + int blockSize = totalCount / oc; + int alignOutputCount = ALIGN_UP4(oc); + std::shared_ptr dequantScale(MNN::Tensor::createDevice({alignOutputCount, blockSize, (int)(sizeof(float) * 2)})); bool res = backend->onAcquireBuffer(dequantScale.get(), Backend::STATIC); if (!res) { MNN_ERROR("Buffer allocated error!\n"); return nullptr; } auto buffer0 = MetalBackend::getBuffer(dequantScale.get()); - auto dst_scale = (uint8_t*)[buffer0.first contents] + buffer0.second; - ::memset(dst_scale, 0, alignOutputCount * 2 * sizeof(float)); - auto dst_bias = dst_scale + alignOutputCount * sizeof(float); - for (int o = 0; o < outputCount; ++o) { - float min = 0.0f; - float alpha = 0.0f; - if (asymmetric) { - min = scale[2*o]; - alpha = scale[2*o+1]; - } else { - alpha = scale[o]; + auto dst_scale = (float*)((uint8_t*)[buffer0.first contents] + buffer0.second); + ::memset(dst_scale, 0, dequantScale->usize()); + if (asymmetric) { + for (int z=0; zweight.get() != nullptr) { auto backend = static_cast(this->backend()); mWeight = weightTransform(group, oc, ic, kh, kw, (float*)qnt->weight.get(), !qnt->canUseInt4, qnt->canUseInt4); - auto dequantParams = getDequantScale(qnt->alpha.get(), qnt->alpha.size(), backend, qnt->asymmetric); + auto dequantParams = getDequantScale(qnt->alpha.get(), qnt->alpha.size(), backend, qnt->asymmetric, oc); mDequantScaleBias = dequantParams; mDequantBits = qnt->canUseInt4 ? 
4:8; } else if (qnt && qnt->weightFloat.size() > 0) { diff --git a/source/backend/metal/shader/MetalConvolution1x1.metal b/source/backend/metal/shader/MetalConvolution1x1.metal index 07b177dd9..21bd0d8d0 100644 --- a/source/backend/metal/shader/MetalConvolution1x1.metal +++ b/source/backend/metal/shader/MetalConvolution1x1.metal @@ -10,6 +10,7 @@ struct conv1x1_constants { int output_slice; int output_channel; int batch; + int block_size; conv_activation_type activation; }; @@ -67,30 +68,32 @@ kernel void conv1x1_g1z4_w8(const device ftype4 *in [[buffer(0)]], auto biasValue = FLOAT4(biasTerms[uz]); FLOAT4 result0 = biasValue, result1 = biasValue, result2 = biasValue, result3 = biasValue; int computeSize = min(cst.output_size - rx, CONV_UNROLL); - auto scale = FLOAT4(dequantScale[uz]); - auto dequant_bias = FLOAT4(dequantScale[uz + cst.output_slice]); - - for (auto z = 0; z < cst.input_slice; z++) { - auto in40 = (FLOAT4)*xy_in0; - auto in41 = (FLOAT4)*(xy_in0 + 1); - auto in42 = (FLOAT4)*(xy_in0 + 2); - auto in43 = (FLOAT4)*(xy_in0 + 3); - auto w = xy_wt[z]; - - /* weight int8->float */ - FLOAT4x4 w_fp32 = FLOAT4x4(FLOAT4(w[0]), FLOAT4(w[1]), FLOAT4(w[2]), FLOAT4(w[3])); - FLOAT4x4 w_dequant; - for (int i = 0; i < 4; ++i) { - w_dequant[i] = w_fp32[i] * scale[i] + dequant_bias[i]; + int block = (cst.input_slice + cst.block_size - 1) / cst.block_size; + for (int bi=0; bi 1) {xy_out[1] = activate(ftype4(result1), cst.activation); } @@ -115,31 +118,35 @@ kernel void conv1x1_g1z4_w4(const device ftype4 *in [[buffer(0)]], auto biasValue = FLOAT4(biasTerms[uz]); FLOAT4 result0 = biasValue, result1 = biasValue, result2 = biasValue, result3 = biasValue; int computeSize = min(cst.output_size - rx, CONV_UNROLL); - auto scale = FLOAT4(dequantScale[uz]); - auto dequant_bias = FLOAT4(dequantScale[uz + cst.output_slice]); - - for (auto z = 0; z < cst.input_slice; z++) { - auto in40 = (FLOAT4)*xy_in0; - auto in41 = (FLOAT4)*(xy_in0 + 1); - auto in42 = (FLOAT4)*(xy_in0 + 2); - auto in43 = (FLOAT4)*(xy_in0 + 3); - MNN::uchar4x2 w_int4 = xy_wt[z]; - // MNN::char4x4 w_int8(char4(0)); - /* weight int4->float */ - //FLOAT4x4 w_fp32 = FLOAT4x4(FLOAT4(w[0]), FLOAT4(w[1]), FLOAT4(w[2]), FLOAT4(w[3])); - FLOAT4x4 w_dequant; - for (int i = 0; i < 4; ++i) { - // ftype4 w4 = ftype4(w_fp32[i]); - FLOAT4 w4 = FLOAT4((float)(w_int4[i][0] >> 4) - 8, (float)(w_int4[i][0] & 15) - 8, (float)(w_int4[i][1] >> 4) - 8, (float)(w_int4[i][1] & 15) - 8); - FLOAT4 res = w4 * scale[i] + dequant_bias[i]; - w_dequant[i] = res; + int block = (cst.input_slice + cst.block_size - 1) / cst.block_size; + for (int bi=0; bifloat */ + //FLOAT4x4 w_fp32 = FLOAT4x4(FLOAT4(w[0]), FLOAT4(w[1]), FLOAT4(w[2]), FLOAT4(w[3])); + FLOAT4x4 w_dequant; + for (int i = 0; i < 4; ++i) { + // ftype4 w4 = ftype4(w_fp32[i]); + FLOAT4 w4 = FLOAT4((float)(w_int4[i][0] >> 4) - 8, (float)(w_int4[i][0] & 15) - 8, (float)(w_int4[i][1] >> 4) - 8, (float)(w_int4[i][1] & 15) - 8); + FLOAT4 res = w4 * scale[i] + dequant_bias[i]; + w_dequant[i] = res; + } + + result0 += FLOAT4(in40 * w_dequant); + result1 += FLOAT4(in41 * w_dequant); + result2 += FLOAT4(in42 * w_dequant); + result3 += FLOAT4(in43 * w_dequant); + xy_in0 += cst.input_size * cst.batch; } - - result0 += FLOAT4(in40 * w_dequant); - result1 += FLOAT4(in41 * w_dequant); - result2 += FLOAT4(in42 * w_dequant); - result3 += FLOAT4(in43 * w_dequant); - xy_in0 += cst.input_size * cst.batch; } /* true */ diff --git a/source/backend/opencl/execution/cl/opencl_program.cc 
b/source/backend/opencl/execution/cl/opencl_program.cc
index 36938ef60..7a460a3a0 100644
--- a/source/backend/opencl/execution/cl/opencl_program.cc
+++ b/source/backend/opencl/execution/cl/opencl_program.cc
@@ -207,7 +207,7 @@ extern const std::map> OpenCLProgramMap
 #ifndef MNN_OPENCL_BUFFER_CLOSED
     {
         "attention_buf",
-        { 0x23,0x69,0x66,0x64,0x65,0x66, /* auto-generated hex encoding of the attention_buf OpenCL kernel source; the machine-generated byte array is not human-readable and the remainder of this hunk is omitted here */ },
x69,0x6e,0x74,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x7b,0xa,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x78,0x20,0x3d,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x3b,0x20,0x2f,0x2f,0x20,0x70,0x72,0x65,0x66,0x69,0x6c,0x6c,0x20,0x71,0x6b,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2f,0x20,0x34,0x20,0x20,0x20,0x64,0x65,0x63,0x6f,0x64,0x65,0x20,0x31,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x79,0x20,0x3d,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x3b,0x20,0x2f,0x2f,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x7a,0x20,0x3d,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x32,0x29,0x3b,0x20,0x2f,0x2f,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x3c,0x3c,0x20,0x32,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x7a,0x34,0x20,0x3d,0x20,0x7a,0x20,0x3c,0x3c,0x20,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x44,0x45,0x41,0x4c,0x5f,0x4e,0x4f,0x4e,0x5f,0x55,0x4e,0x49,0x46,0x4f,0x52,0x4d,0x5f,0x44,0x49,0x4d,0x33,0x28,0x78,0x2c,0x20,0x79,0x2c,0x20,0x7a,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x4f,0x50,0x45,0x4e,0x43,0x4c,0x5f,0x50,0x52,0x45,0x46,0x49,0x4c,0x4c,0x5f,0x41,0x54,0x54,0x45,0x4e,0x54,0x49,0x4f,0x4e,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x20,0x3d,0x20,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2a,0x20,0x34,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x3d,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x33,0x29,0x20,0x2f,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x71,0x6b,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x3d,0x20,0x28,0x71,0x6b,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x33,0x29,0x20,0x2f,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x20,0x2b,0x20,0x28,0x79,0x20,0x2a,0x20,0x71,0x6b,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2b,0x20,0x78,0x29,0x20,0x2a,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x20,0x2b,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x3b,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x
67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x70,0x61,0x73,0x74,0x5f,0x76,0x61,0x6c,0x75,0x65,0x20,0x2b,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x30,0x3b,0x20,0x69,0x20,0x3c,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x20,0x69,0x6e,0x64,0x65,0x78,0x20,0x3d,0x20,0x69,0x20,0x3c,0x3c,0x20,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x30,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x6e,0x64,0x65,0x78,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x31,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x31,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x32,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x32,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x33,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x33,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x42,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x30,0x2c,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x69,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0
x2e,0x73,0x30,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x31,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x31,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x32,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x32,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x33,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x34,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x31,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x35,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x32,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x36,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x37,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x38,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x31,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x39,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x32,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x61,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x62,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x63,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x31,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x64,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0x
a,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x32,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x65,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x66,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x31,0x36,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x42,0x29,0x2c,0x20,0x30,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x69,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0xa,0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x48,0x45,0x41,0x44,0x44,0x49,0x4d,0x5f,0x4c,0x45,0x41,0x56,0x45,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x42,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x30,0x2c,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x42,0x5f,0x70,0x74,0x72,0x20,0x3d,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x29,0x26,0x42,0x3b,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x34,0x2c,0x20,0x6a,0x20,0x3d,0x20,0x30,0x3b,0x20,0x69,0x20,0x3c,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x3b,0x20,0x2b,0x2b,0x69,0x2c,0x20,0x2b,0x2b,0x6a,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x30,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x20,0x2b,0x20,0x34,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x20,0x2b,0x20,0x38,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x
45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x20,0x2b,0x20,0x31,0x32,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x6f,0x75,0x74,0x30,0x29,0x2c,0x20,0x30,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x29,0x20,0x2a,0x20,0x34,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x42,0x2e,0x73,0x30,0x31,0x32,0x33,0x29,0x2c,0x20,0x30,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x31,0x20,0x3e,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x6f,0x75,0x74,0x31,0x29,0x2c,0x20,0x30,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x34,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x42,0x2e,0x73,0x34,0x35,0x36,0x37,0x29,0x2c,0x20,0x31,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x32,0x20,0x3e,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x6f,0x75,0x74,0x32,0x29,0x2c,0x20,0x30,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x34,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x42,0x2e,0x73,0x38,0x39,0x61,0x62,0x29,0x2c,0x20,0x32,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x33,0x20,0x3e,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x6f,0x75,0x74,0x33,0x29,0x2c,0x20,0x30,0x2
c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x34,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x42,0x2e,0x73,0x63,0x64,0x65,0x66,0x29,0x2c,0x20,0x33,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x23,0x65,0x6c,0x73,0x65,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x42,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x30,0x2c,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x31,0x36,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x42,0x29,0x2c,0x20,0x30,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x42,0x5f,0x70,0x74,0x72,0x20,0x3d,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x29,0x26,0x42,0x3b,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x34,0x2c,0x20,0x6a,0x20,0x3d,0x20,0x30,0x3b,0x20,0x69,0x20,0x3c,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x3b,0x20,0x2b,0x2b,0x69,0x2c,0x20,0x2b,0x2b,0x6a,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x30,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x20,0x2b,0x20,0x34,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x20,0x2b,0x20,0x38,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x32,0x2
9,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x20,0x2b,0x20,0x31,0x32,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x31,0x36,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x29,0x28,0x6f,0x75,0x74,0x30,0x2c,0x20,0x6f,0x75,0x74,0x31,0x2c,0x20,0x6f,0x75,0x74,0x32,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x29,0x2c,0x20,0x30,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x29,0x20,0x2a,0x20,0x34,0x29,0x3b,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0xa,0x23,0x65,0x6c,0x73,0x65,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x3d,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x33,0x29,0x20,0x2f,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x20,0x3d,0x20,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2a,0x20,0x34,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x6c,0x6f,0x6f,0x70,0x20,0x3d,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x32,0x29,0x20,0x2f,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x20,0x2b,0x20,0x79,0x20,0x2a,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x20,0x2b,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x3b,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x70,0x61,0x73,0x74,0x5f,0x76,0x61,0x6c,0x75,0x65,0x20,0x2b,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x30,0x3b,0x20,0x69,0x20,0x3c,0x20,0x6c,0x6f,0x6f,0x70,0x20,0x2d,0x20,0x31,0x3b,0x20,0x69,0x2b,0x2b,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,
0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x42,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x30,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x69,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x30,0x20,0x2b,0x3d,0x20,0x64,0x6f,0x74,0x28,0x41,0x2c,0x20,0x42,0x2e,0x73,0x30,0x31,0x32,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x31,0x20,0x2b,0x3d,0x20,0x64,0x6f,0x74,0x28,0x41,0x2c,0x20,0x42,0x2e,0x73,0x34,0x35,0x36,0x37,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x32,0x20,0x2b,0x3d,0x20,0x64,0x6f,0x74,0x28,0x41,0x2c,0x20,0x42,0x2e,0x73,0x38,0x39,0x61,0x62,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x33,0x20,0x2b,0x3d,0x20,0x64,0x6f,0x74,0x28,0x41,0x2c,0x20,0x42,0x2e,0x73,0x63,0x64,0x65,0x66,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x20,0x73,0x74,0x61,0x72,0x74,0x20,0x3d,0x20,0x28,0x6c,0x6f,0x6f,0x70,0x20,0x2d,0x20,0x31,0x29,0x20,0x3c,0x20,0x30,0x20,0x3f,0x20,0x30,0x20,0x3a,0x20,0x28,0x6c,0x6f,0x6f,0x70,0x20,0x2d,0x20,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x42,0x5f,0x56,0x65,0x63,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x30,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x73,0x74,0x61,0x72,0x74,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x42,0x5f,0x70,0x74,0x72,0x20,0x3d,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x29,0x26,0x42,0x5f,0x56,0x65,0x63,0x3b,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x73,0x74,0x61,0x72,0x74,0x20,0x2a,0x20,0x34,0x3b,0x20,0x69,0x20,0x3c,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2d,0x20,0x31,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x41,0x20,0x3d,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x20,0x69,0x6e,0x64,0x65,0x78,0x20,0x3d,0x20,0x69,0x20,0x25,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x30,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x5f,0x70,0x74,0x72,0x5b,0x69,0x6e,0x64,0x65,0x78,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x31,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x5f,0x70,0x74,0x72,0x5b,0x69,0x6e,0x64,0x65,0x78,0x2b,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x32,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x
20,0x42,0x5f,0x70,0x74,0x72,0x5b,0x69,0x6e,0x64,0x65,0x78,0x2b,0x38,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x33,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x5f,0x70,0x74,0x72,0x5b,0x69,0x6e,0x64,0x65,0x78,0x2b,0x31,0x32,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x41,0x20,0x3d,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2d,0x20,0x31,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x42,0x30,0x20,0x3d,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x30,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x42,0x31,0x20,0x3d,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x42,0x32,0x20,0x3d,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x38,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x42,0x33,0x20,0x3d,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x31,0x32,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x30,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x31,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x32,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x33,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x33,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x20,0x69,0x6e,0x64,0x65,0x78,0x20,0x3d,0x20,0x28,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2d,0x20,0x31,0x29,0x20,0x3e,0x3e,0x20,0x32,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2d,0x20,0x31,0x29,0x20,0x25,0x20,0x34,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x48,0x45,0x41,0x44,0x44,0x49,0x4d,0x5f,0x4c,0x45,0x41,0x56,0x45,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x5d,0x20,0x3d,0x20,0x42,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x31,0x20,0x3e,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x34,0x5d,0x20,0x3d,0x20,0x42,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x32,0x20,0x3e,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x6
5,0x78,0x20,0x2b,0x20,0x38,0x5d,0x20,0x3d,0x20,0x42,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x33,0x20,0x3e,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x31,0x32,0x5d,0x20,0x3d,0x20,0x42,0x33,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x33,0x3b,0xa,0x23,0x65,0x6c,0x73,0x65,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x5d,0x20,0x3d,0x20,0x42,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x34,0x5d,0x20,0x3d,0x20,0x42,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x38,0x5d,0x20,0x3d,0x20,0x42,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x31,0x32,0x5d,0x20,0x3d,0x20,0x42,0x33,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x33,0x3b,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0x20,0x20,0x20,0x20,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0x7d,0xa,0xa, } + { 
0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x4d,0x4e,0x4e,0x5f,0x53,0x55,0x50,0x50,0x4f,0x52,0x54,0x5f,0x46,0x50,0x31,0x36,0xa,0x23,0x70,0x72,0x61,0x67,0x6d,0x61,0x20,0x4f,0x50,0x45,0x4e,0x43,0x4c,0x20,0x45,0x58,0x54,0x45,0x4e,0x53,0x49,0x4f,0x4e,0x20,0x63,0x6c,0x5f,0x6b,0x68,0x72,0x5f,0x66,0x70,0x31,0x36,0x20,0x3a,0x20,0x65,0x6e,0x61,0x62,0x6c,0x65,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0xa,0x23,0x64,0x65,0x66,0x69,0x6e,0x65,0x20,0x47,0x4c,0x4f,0x42,0x41,0x4c,0x5f,0x53,0x49,0x5a,0x45,0x5f,0x33,0x5f,0x44,0x49,0x4d,0x53,0x20,0x5c,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x73,0x69,0x7a,0x65,0x5f,0x64,0x69,0x6d,0x30,0x2c,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x73,0x69,0x7a,0x65,0x5f,0x64,0x69,0x6d,0x31,0x2c,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x73,0x69,0x7a,0x65,0x5f,0x64,0x69,0x6d,0x32,0x2c,0xa,0xa,0x23,0x64,0x65,0x66,0x69,0x6e,0x65,0x20,0x44,0x45,0x41,0x4c,0x5f,0x4e,0x4f,0x4e,0x5f,0x55,0x4e,0x49,0x46,0x4f,0x52,0x4d,0x5f,0x44,0x49,0x4d,0x33,0x28,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x69,0x6e,0x70,0x75,0x74,0x32,0x2c,0x20,0x69,0x6e,0x70,0x75,0x74,0x33,0x29,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5c,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x20,0x28,0x69,0x6e,0x70,0x75,0x74,0x31,0x20,0x3e,0x3d,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x73,0x69,0x7a,0x65,0x5f,0x64,0x69,0x6d,0x30,0x20,0x7c,0x7c,0x20,0x69,0x6e,0x70,0x75,0x74,0x32,0x20,0x3e,0x3d,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x73,0x69,0x7a,0x65,0x5f,0x64,0x69,0x6d,0x31,0x20,0x7c,0x7c,0x20,0x69,0x6e,0x70,0x75,0x74,0x33,0x20,0x3e,0x3d,0x20,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x73,0x69,0x7a,0x65,0x5f,0x64,0x69,0x6d,0x32,0x29,0x20,0x7b,0x20,0x5c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5c,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x6d,0x61,0x74,0x6d,0x75,0x6c,0x5f,0x71,0x6b,0x5f,0x64,0x69,0x76,0x5f,0x6d,0x61,0x73,0x6b,0x28,0x47,0x4c,0x4f,0x42,0x41,0x4c,0x5f,0x53,0x49,0x5a,0x45,0x5f,0x33,0x5f,0x44,0x49,0x4d,0x53,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x2f,0x2f,0x20,0x71,0x75,0x65,0x72,0x79,0x20,0x5b,0x31,0x20,0x71,0x75,0x65,0x72,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x2f,0x34,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x34,0x5d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x2f,0x2f,0x20,0x6b,0x65,0x79,0x20,0x5b,0x31,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x2f,0x34,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x34,0x5d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x20,0x2f,0x2f,0x20,0x70,0x72,0x65,0x66,0x69,0x6c,0x6c,0x20,0x5b,0x31,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x20,0x71,0x75,0x65,0x72,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x2f,0x34,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x34,0x5d,0x20,0x20,0x20,0x64,0x65,0x63,0x6f,0x64,0x65,0x5b,0x31,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x2f,0x34,0x20,0x34,0x5d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x70,0x61,0x73,0x74,0x5f,0x6b,0x65,0x79,0x2c,0x20,0x2f,0x2f,0x20,0x5b,0x31,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x2f,0x34,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x34,0x5d,0xa,0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x41,0x44,0x44,0x5f,0x4d,0x41,0x53,0x4b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x20,0x6d,0x61,0x73,0x6b,0x2c,0xa,0x23,0x65,0x6c,0x73,0x65,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x2a,0x20,0x6d,0x61,0x73,0x6b,0x2c,0x20,0x2f,0x2f,0x20,0x5b,0x31,0x20,0x31,0x20,0x71,0x75,0x65,0x72,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x34,0x5d,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x66,0x6c,0x6f,0x61,0x74,0x20,0x73,0x63,0x61,0x6c,0x65,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x71,0x75,0x65,0x72,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0
x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x7b,0xa,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x78,0x20,0x3d,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x3b,0x20,0x2f,0x2f,0x20,0x71,0x75,0x65,0x72,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2f,0x20,0x34,0x20,0x66,0x6f,0x72,0x20,0x70,0x72,0x65,0x66,0x69,0x6c,0x6c,0x20,0x20,0x20,0x31,0x20,0x66,0x6f,0x72,0x20,0x64,0x65,0x63,0x6f,0x64,0x65,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x79,0x20,0x3d,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x3b,0x20,0x2f,0x2f,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x7a,0x20,0x3d,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x32,0x29,0x3b,0x20,0x2f,0x2f,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2f,0x20,0x34,0xa,0x20,0x20,0x20,0x20,0x44,0x45,0x41,0x4c,0x5f,0x4e,0x4f,0x4e,0x5f,0x55,0x4e,0x49,0x46,0x4f,0x52,0x4d,0x5f,0x44,0x49,0x4d,0x33,0x28,0x78,0x2c,0x20,0x79,0x2c,0x20,0x7a,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x20,0x3d,0x20,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x3b,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x50,0x61,0x73,0x74,0x6b,0x65,0x79,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x70,0x61,0x73,0x74,0x5f,0x6b,0x65,0x79,0x20,0x2b,0x20,0x7a,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x7a,0x34,0x20,0x3d,0x20,0x7a,0x20,0x3c,0x3c,0x20,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x56,0x73,0x63,0x61,0x6c,0x65,0x20,0x3d,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x73,0x63,0x61,0x6c,0x65,0x3b,0xa,0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x4f,0x50,0x45,0x4e,0x43,0x4c,0x5f,0x50,0x52,0x45,0x46,0x49,0x4c,0x4c,0x5f,0x41,0x54,0x54,0x45,0x4e,0x54,0x49,0x4f,0x4e,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x42,0x5f,0x6f,0x66,0x66,0
x73,0x65,0x74,0x20,0x3d,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x20,0x2b,0x20,0x7a,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x78,0x34,0x20,0x3d,0x20,0x78,0x20,0x3c,0x3c,0x20,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x71,0x75,0x65,0x72,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x3d,0x20,0x28,0x71,0x75,0x65,0x72,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x33,0x29,0x20,0x2f,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x79,0x20,0x2a,0x20,0x71,0x75,0x65,0x72,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x34,0x20,0x3d,0x20,0x28,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x33,0x29,0x20,0x2f,0x20,0x34,0x3b,0xa,0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x48,0x45,0x41,0x44,0x44,0x49,0x4d,0x5f,0x4c,0x45,0x41,0x56,0x45,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x30,0x3b,0x20,0x69,0x20,0x3c,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x34,0x20,0x2d,0x20,0x31,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x41,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x69,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x42,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x69,0x2c,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x30,0x31,0x32,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x30,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x30,0x31,0x32,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x31,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x6
4,0x28,0x41,0x2e,0x73,0x30,0x31,0x32,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x32,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x30,0x31,0x32,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x33,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x34,0x35,0x36,0x37,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x34,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x34,0x35,0x36,0x37,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x35,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x34,0x35,0x36,0x37,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x36,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x34,0x35,0x36,0x37,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x37,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x38,0x39,0x61,0x62,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x38,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x38,0x39,0x61,0x62,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x39,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x38,0x39,0x61,0x62,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x61,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x38,0x39,0x61,0x62,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x62,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x63,0x64,0x65,0x66,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x63,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x63,0x64,0x65,0x66,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x64,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2
e,0x73,0x63,0x64,0x65,0x66,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x65,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x63,0x64,0x65,0x66,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x66,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x31,0x36,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x42,0x29,0x2c,0x20,0x69,0x2c,0x20,0x50,0x61,0x73,0x74,0x6b,0x65,0x79,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x28,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x34,0x3b,0x20,0x69,0x20,0x3c,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x42,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x2c,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x30,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x31,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x32,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x33,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x42,0x29,0x2c,0x20,0x69,0x2c,0x20,0x50,0x61,0x73,0x74,0x6b,0x65,0x79,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x23,0x65,0x6c,0x73,0x65,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x30,0x3b,0x20,0x69,0x20,0x3c,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x34,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0
x41,0x54,0x31,0x36,0x20,0x41,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x69,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x42,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x69,0x2c,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x30,0x31,0x32,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x30,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x30,0x31,0x32,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x31,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x30,0x31,0x32,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x32,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x30,0x31,0x32,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x33,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x34,0x35,0x36,0x37,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x34,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x34,0x35,0x36,0x37,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x35,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x34,0x35,0x36,0x37,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x36,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x34,0x35,0x36,0x37,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x37,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x38,0x39,0x61,0x62,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x38,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x38,0x39,0x61,0x62,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,
0x29,0x42,0x2e,0x73,0x39,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x38,0x39,0x61,0x62,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x61,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x38,0x39,0x61,0x62,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x62,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x63,0x64,0x65,0x66,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x63,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x63,0x64,0x65,0x66,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x64,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x63,0x64,0x65,0x66,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x65,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x2e,0x73,0x63,0x64,0x65,0x66,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x66,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x31,0x36,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x42,0x29,0x2c,0x20,0x69,0x2c,0x20,0x50,0x61,0x73,0x74,0x6b,0x65,0x79,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x2a,0x3d,0x20,0x56,0x73,0x63,0x61,0x6c,0x65,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x2a,0x3d,0x20,0x56,0x73,0x63,0x61,0x6c,0x65,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x2a,0x3d,0x20,0x56,0x73,0x63,0x61,0x6c,0x65,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x2a,0x3d,0x20,0x56,0x73,0x63,0x61,0x6c,0x65,0x3b,0xa,0xa,0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x41,0x44,0x44,0x5f,0x4d,0x41,0x53,0x4b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x2e,0x73,0x30,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x30,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x30,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x2e,0x73,0x30,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x30,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x2e,0x73,0x30,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x30,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0
x20,0x2b,0x20,0x32,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x2e,0x73,0x30,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x30,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x2e,0x73,0x31,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x30,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x2e,0x73,0x31,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x2e,0x73,0x31,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x32,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x2e,0x73,0x31,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x2e,0x73,0x32,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x30,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x2e,0x73,0x32,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x2e,0x73,0x32,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x32,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x2e,0x73,0x32,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x2e,0x73,0x33,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x30,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x2e,0x73,0x33,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0
x6f,0x75,0x74,0x32,0x2e,0x73,0x33,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x32,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x2e,0x73,0x33,0x20,0x2b,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x3b,0xa,0x23,0x65,0x6c,0x73,0x65,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x2e,0x73,0x30,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x30,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x30,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x30,0x2e,0x73,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x2e,0x73,0x30,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x30,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x31,0x2e,0x73,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x2e,0x73,0x30,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x30,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x32,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x32,0x2e,0x73,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x2e,0x73,0x30,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x30,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x33,0x2e,0x73,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x2e,0x73,0x31,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x30,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x30,0x2e,0x73,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x2e,0x73,0x31,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x31,0x2e,0x73,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x2e,0x73,0x31,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x32,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0
x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x32,0x2e,0x73,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x2e,0x73,0x31,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x33,0x2e,0x73,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x2e,0x73,0x32,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x30,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x30,0x2e,0x73,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x2e,0x73,0x32,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x31,0x2e,0x73,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x2e,0x73,0x32,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x32,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x32,0x2e,0x73,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x2e,0x73,0x32,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x33,0x2e,0x73,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x2e,0x73,0x33,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x30,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x30,0x2e,0x73,0x33,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x2e,0x73,0x33,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x31,0x2e,0x73,0x33,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x2e,0x73,0x33,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x32,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0
x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x32,0x2e,0x73,0x33,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x2e,0x73,0x33,0x20,0x3d,0x20,0x6d,0x61,0x73,0x6b,0x5b,0x28,0x28,0x78,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x3d,0x20,0x30,0x20,0x3f,0x20,0x2d,0x46,0x4c,0x54,0x5f,0x4d,0x41,0x58,0x20,0x3a,0x20,0x6f,0x75,0x74,0x33,0x2e,0x73,0x33,0x3b,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x6f,0x75,0x74,0x30,0x29,0x2c,0x20,0x30,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2a,0x20,0x34,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2a,0x20,0x34,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x31,0x20,0x3e,0x3d,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x6f,0x75,0x74,0x31,0x29,0x2c,0x20,0x30,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2a,0x20,0x34,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x34,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x32,0x20,0x3e,0x3d,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x6f,0x75,0x74,0x32,0x29,0x2c,0x20,0x30,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2a,0x20,0x34,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x34,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x33,0x20,0x3e,0x3d,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x6f,0x75,0x74,0x33,0x29,0x2c,0x20,0x30,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2a,0x20,0x34,0x20,0x2b,0x20,0x28,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x34,0x29,0x3b,0xa,0x23,0x65,0x6c,0x73,0x65,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x20,0x2b,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x3d,0x20,0x28
,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x33,0x29,0x20,0x2f,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x34,0x20,0x3d,0x20,0x28,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x33,0x29,0x20,0x2f,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x48,0x45,0x41,0x44,0x44,0x49,0x4d,0x5f,0x4c,0x45,0x41,0x56,0x45,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x30,0x3b,0x20,0x69,0x20,0x3c,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x34,0x20,0x2d,0x20,0x31,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x41,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x69,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x42,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x69,0x2c,0x20,0x50,0x61,0x73,0x74,0x6b,0x65,0x79,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x41,0x2e,0x73,0x30,0x2c,0x20,0x42,0x2e,0x73,0x30,0x31,0x32,0x33,0x2c,0x20,0x6f,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x41,0x2e,0x73,0x34,0x2c,0x20,0x42,0x2e,0x73,0x34,0x35,0x36,0x37,0x2c,0x20,0x6f,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x41,0x2e,0x73,0x38,0x2c,0x20,0x42,0x2e,0x73,0x38,0x39,0x61,0x62,0x2c,0x20,0x6f,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x41,0x2e,0x73,0x63,0x2c,0x20,0x42,0x2e,0x73,0x63,0x64,0x65,0x66,0x2c,0x20,0x6f,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x28,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x34,0x3b,0x20,0x69,0x20,0x3c,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x42,0x20,0x3d,0x20,0x43,
0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x2c,0x20,0x50,0x61,0x73,0x74,0x6b,0x65,0x79,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x41,0x2e,0x73,0x30,0x2c,0x20,0x42,0x2c,0x20,0x6f,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x23,0x65,0x6c,0x73,0x65,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x30,0x3b,0x20,0x69,0x20,0x3c,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x34,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x41,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x69,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x42,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x69,0x2c,0x20,0x50,0x61,0x73,0x74,0x6b,0x65,0x79,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x41,0x2e,0x73,0x30,0x2c,0x20,0x42,0x2e,0x73,0x30,0x31,0x32,0x33,0x2c,0x20,0x6f,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x41,0x2e,0x73,0x34,0x2c,0x20,0x42,0x2e,0x73,0x34,0x35,0x36,0x37,0x2c,0x20,0x6f,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x41,0x2e,0x73,0x38,0x2c,0x20,0x42,0x2e,0x73,0x38,0x39,0x61,0x62,0x2c,0x20,0x6f,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x41,0x2e,0x73,0x63,0x2c,0x20,0x42,0x2e,0x73,0x63,0x64,0x65,0x66,0x2c,0x20,0x6f,0x75,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x20,0x3d,0x3d,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x20,0x72,0x65,0x6d,0x61,0x69,0x6e,0x20,0x3d,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2d,0x20,0x7a,0x20,0x2a,0x20,0x34,0x20,0x2d,0x20,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x6b,0x65,0x79,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x3d,0x20,0x72,0x65,0x6d,0x61,0x69,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x74,0x6d,0x70,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d
,0x20,0x30,0x3b,0x20,0x69,0x20,0x3c,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x41,0x20,0x3d,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x2a,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x42,0x20,0x3d,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x2a,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x6b,0x65,0x79,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x42,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x74,0x6d,0x70,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x6f,0x75,0x74,0x5f,0x70,0x74,0x72,0x20,0x3d,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x29,0x26,0x6f,0x75,0x74,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x5f,0x70,0x74,0x72,0x5b,0x72,0x65,0x6d,0x61,0x69,0x6e,0x5d,0x20,0x3d,0x20,0x74,0x6d,0x70,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x20,0x2a,0x3d,0x20,0x56,0x73,0x63,0x61,0x6c,0x65,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x6f,0x75,0x74,0x29,0x2c,0x20,0x30,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x79,0x20,0x2a,0x20,0x6b,0x65,0x79,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2a,0x20,0x34,0x20,0x2b,0x20,0x7a,0x34,0x29,0x3b,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0x7d,0xa,0xa,0x5f,0x5f,0x6b,0x65,0x72,0x6e,0x65,0x6c,0x20,0x76,0x6f,0x69,0x64,0x20,0x6d,0x61,0x74,0x6d,0x75,0x6c,0x5f,0x71,0x6b,0x76,0x28,0x47,0x4c,0x4f,0x42,0x41,0x4c,0x5f,0x53,0x49,0x5a,0x45,0x5f,0x33,0x5f,0x44,0x49,0x4d,0x53,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x69,0x6e,0x70,0x75,0x74,0x30,0x2c,0x20,0x2f,0x2f,0x20,0x71,0x6b,0x20,0x70,0x72,0x65,0x66,0x69,0x6c,0x6c,0x20,0x5b,0x31,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x20,0x71,0x6b,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x2f,0x34,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x34,0x5d,0x20,0x20,0x20,0x64,0x65,0x63,0x6f,0x64,0x65,0x5b,0x31,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x2f,0x34,0x20,0x34,0x5d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x69,0x6e,0x70,0x75,0x74,0x31,0x2c,0x20,0x2f,0x2f,0x20,0x5b,0x31,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x2f,0x34,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x34,0x5d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0
x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x6f,0x75,0x74,0x70,0x75,0x74,0x2c,0x20,0x2f,0x2f,0x20,0x5b,0x31,0x20,0x71,0x6b,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x2a,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x31,0x20,0x34,0x5d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x70,0x61,0x73,0x74,0x5f,0x76,0x61,0x6c,0x75,0x65,0x2c,0x20,0x2f,0x2f,0x20,0x5b,0x31,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x2f,0x34,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x34,0x5d,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x71,0x6b,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x2c,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x5f,0x5f,0x70,0x72,0x69,0x76,0x61,0x74,0x65,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x7b,0xa,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x78,0x20,0x3d,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x30,0x29,0x3b,0x20,0x2f,0x2f,0x20,0x70,0x72,0x65,0x66,0x69,0x6c,0x6c,0x20,0x71,0x6b,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2f,0x20,0x34,0x20,0x20,0x20,0x64,0x65,0x63,0x6f,0x64,0x65,0x20,0x31,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x79,0x20,0x3d,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x31,0x29,0x3b,0x20,0x2f,0x2f,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x7a,0x20,0x3d,0x20,0x67,0x65,0x74,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x5f,0x69,0x64,0x28,0x32,0x29,0x3b,0x20,0x2f,0x2f,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x3c,0x3c,0x20,0x32,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x7a,0x34,0x20,0x3d,0x20,0x7a,0x20,0x3c,0x3c,0x20,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x44,0x45,0x41,0x4c,0x5f,0x4e,0x4f,0x4e,0x5f,0x55,0x4e,0x49,0x46,0x4f,0x52,0x4d,0x5f,0x44,0x49,0x4d,0x33,0x28,0x78,0x2c,0x20,0x79,0x2c,0x20,0x7a,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x4f,0x50,0x45,0x4e,0x43,0x4c,0x5f,0x50,0x52,0x45,0x46,0x49,0x4c,0x4c,0x5f,0x41,0x54,0x54,0x45,0x4e,0x54,0x49,0x4f,0x4e,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x68,
0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x20,0x3d,0x20,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2a,0x20,0x34,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x3d,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x33,0x29,0x20,0x2f,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x71,0x6b,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x3d,0x20,0x28,0x71,0x6b,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x33,0x29,0x20,0x2f,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x20,0x2b,0x20,0x28,0x79,0x20,0x2a,0x20,0x71,0x6b,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2b,0x20,0x78,0x29,0x20,0x2a,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x20,0x2b,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x3b,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x70,0x61,0x73,0x74,0x5f,0x76,0x61,0x6c,0x75,0x65,0x20,0x2b,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x30,0x3b,0x20,0x69,0x20,0x3c,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x20,0x69,0x6e,0x64,0x65,0x78,0x20,0x3d,0x20,0x69,0x20,0x3c,0x3c,0x20,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x30,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x6e,0x64,0x65,0x78,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x31,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,
0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x31,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x32,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x32,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x33,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x33,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x42,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x30,0x2c,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x69,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x30,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x31,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x31,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x32,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x32,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x33,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x34,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x31,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x35,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x32,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x36,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x7
3,0x37,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x38,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x31,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x39,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x32,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x61,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x62,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x63,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x31,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x64,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x32,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x65,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x33,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x2e,0x73,0x66,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x31,0x36,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x42,0x29,0x2c,0x20,0x30,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x69,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0xa,0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x48,0x45,0x41,0x44,0x44,0x49,0x4d,0x5f,0x4c,0x45,0x41,0x56,0x45,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x42,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x30,0x2c,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x42,0x5f,0x70,0x74,0x72,0x20,0x3d,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x29,0x26,0x42,0x3b,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x28,0x76,0x61,0
x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x34,0x2c,0x20,0x6a,0x20,0x3d,0x20,0x30,0x3b,0x20,0x69,0x20,0x3c,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x3b,0x20,0x2b,0x2b,0x69,0x2c,0x20,0x2b,0x2b,0x6a,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x30,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x20,0x2b,0x20,0x34,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x20,0x2b,0x20,0x38,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x20,0x2b,0x20,0x31,0x32,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x6f,0x75,0x74,0x30,0x29,0x2c,0x20,0x30,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x29,0x20,0x2a,0x20,0x34,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x42,0x2e,0x73,0x30,0x31,0x32,0x33,0x29,0x2c,0x20,0x30,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x31,0x20,0x3e,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x6f,0x75,0x74,0x31,0x29,0x2c,0x20,0x30,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x34,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x42,0x2e,0x73,0x34,0x35,0x36,0x37,0x29,0x2c,0x20,0x
31,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x32,0x20,0x3e,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x6f,0x75,0x74,0x32,0x29,0x2c,0x20,0x30,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x34,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x42,0x2e,0x73,0x38,0x39,0x61,0x62,0x29,0x2c,0x20,0x32,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x33,0x20,0x3e,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x6f,0x75,0x74,0x33,0x29,0x2c,0x20,0x30,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x34,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x34,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x42,0x2e,0x73,0x63,0x64,0x65,0x66,0x29,0x2c,0x20,0x33,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x23,0x65,0x6c,0x73,0x65,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x42,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x30,0x2c,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x31,0x36,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x42,0x29,0x2c,0x20,0x30,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x42,0x5f,0x70,0x74,0x72,0x20,0x3d,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x
54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x2a,0x29,0x26,0x42,0x3b,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2d,0x20,0x31,0x29,0x20,0x2a,0x20,0x34,0x2c,0x20,0x6a,0x20,0x3d,0x20,0x30,0x3b,0x20,0x69,0x20,0x3c,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x3b,0x20,0x2b,0x2b,0x69,0x2c,0x20,0x2b,0x2b,0x6a,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x30,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x30,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x30,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x31,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x20,0x2b,0x20,0x34,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x32,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x20,0x2b,0x20,0x38,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x32,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x33,0x20,0x3d,0x20,0x6d,0x61,0x64,0x28,0x41,0x30,0x2c,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x29,0x42,0x5f,0x70,0x74,0x72,0x5b,0x6a,0x20,0x2b,0x20,0x31,0x32,0x5d,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x76,0x73,0x74,0x6f,0x72,0x65,0x31,0x36,0x28,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x29,0x28,0x6f,0x75,0x74,0x30,0x2c,0x20,0x6f,0x75,0x74,0x31,0x2c,0x20,0x6f,0x75,0x74,0x32,0x2c,0x20,0x6f,0x75,0x74,0x33,0x29,0x29,0x2c,0x20,0x30,0x2c,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x20,0x2b,0x20,0x78,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x29,0x20,0x2a,0x20,0x34,0x29,0x3b,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0xa,0x23,0x65,0x6c,0x73,0x65,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x3d,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x33,0x29,0x20,0x2f,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x6e,0x75,0x6d,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x69,0x6e,0x74,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x20,0x3d,0x20,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2a,0x20,0x34,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x63,0x6f,0x6e,0x73,0x
74,0x20,0x69,0x6e,0x74,0x20,0x6c,0x6f,0x6f,0x70,0x20,0x3d,0x20,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2b,0x20,0x32,0x29,0x20,0x2f,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x69,0x6e,0x70,0x75,0x74,0x30,0x20,0x2b,0x20,0x79,0x20,0x2a,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x34,0x20,0x2a,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x63,0x6f,0x6e,0x73,0x74,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x69,0x6e,0x70,0x75,0x74,0x31,0x20,0x2b,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x3b,0xa,0x20,0x20,0x20,0x20,0x5f,0x5f,0x67,0x6c,0x6f,0x62,0x61,0x6c,0x20,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x3d,0x20,0x70,0x61,0x73,0x74,0x5f,0x76,0x61,0x6c,0x75,0x65,0x20,0x2b,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x5f,0x68,0x65,0x61,0x64,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x6f,0x75,0x74,0x20,0x3d,0x20,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x30,0x3b,0x20,0x69,0x20,0x3c,0x20,0x6c,0x6f,0x6f,0x70,0x20,0x2d,0x20,0x31,0x3b,0x20,0x69,0x2b,0x2b,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x20,0x41,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x34,0x28,0x76,0x6c,0x6f,0x61,0x64,0x34,0x28,0x69,0x2c,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x42,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x30,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x69,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x30,0x20,0x2b,0x3d,0x20,0x64,0x6f,0x74,0x28,0x41,0x2c,0x20,0x42,0x2e,0x73,0x30,0x31,0x32,0x33,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x31,0x20,0x2b,0x3d,0x20,0x64,0x6f,0x74,0x28,0x41,0x2c,0x20,0x42,0x2e,0x73,0x34,0x35,0x36,0x37,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x32,0x20,0x2b,0x3d,0x20,0x64,0x6f,0x74,0x28,0x41,0x2c,0x20,0x42,0x2e,0x73,0x38,0x39,0x61,0x62,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x33,0x20,0x2b,0x3d,0x20,0x64,0x6f,0x74,0x28,0x41,0x2c,0x20,0x42,0x2e,0x73,0x63,0x64,0x65,0x66,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x20,0x73,0x74,0x61,0x72,0x74,0x20,0x3d,0x20,0x28,0x6c,0x6f,0x6f,0x70,0x20,0x2d,0x20,0x31,0x29,0x20,0x3c,0x20,0x30,0x20,0x3f,0x20,0x30,0x20,0x3a,0x20,0x28,0x6c,0x6f,0x6f,0x70,0x20,0x2d,0x20,0x31,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x20,0x42,0x5f,0x56,0x65,0x63,0x20,0x3d,0x20,0x43,0x4f,0x4e,0x56,0x45,0x52,0x54,0x5f,0x43,0x4f,0x4
d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x31,0x36,0x28,0x76,0x6c,0x6f,0x61,0x64,0x31,0x36,0x28,0x30,0x2c,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x73,0x74,0x61,0x72,0x74,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x29,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x42,0x5f,0x70,0x74,0x72,0x20,0x3d,0x20,0x28,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x2a,0x29,0x26,0x42,0x5f,0x56,0x65,0x63,0x3b,0xa,0x20,0x20,0x20,0x20,0x66,0x6f,0x72,0x28,0x69,0x6e,0x74,0x20,0x69,0x20,0x3d,0x20,0x73,0x74,0x61,0x72,0x74,0x20,0x2a,0x20,0x34,0x3b,0x20,0x69,0x20,0x3c,0x20,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2d,0x20,0x31,0x3b,0x20,0x2b,0x2b,0x69,0x29,0x7b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x41,0x20,0x3d,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x20,0x69,0x6e,0x64,0x65,0x78,0x20,0x3d,0x20,0x69,0x20,0x25,0x20,0x34,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x30,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x5f,0x70,0x74,0x72,0x5b,0x69,0x6e,0x64,0x65,0x78,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x31,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x5f,0x70,0x74,0x72,0x5b,0x69,0x6e,0x64,0x65,0x78,0x2b,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x32,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x5f,0x70,0x74,0x72,0x5b,0x69,0x6e,0x64,0x65,0x78,0x2b,0x38,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x33,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x5f,0x70,0x74,0x72,0x5b,0x69,0x6e,0x64,0x65,0x78,0x2b,0x31,0x32,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x7d,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x41,0x20,0x3d,0x20,0x41,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2d,0x20,0x31,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x42,0x30,0x20,0x3d,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x30,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x42,0x31,0x20,0x3d,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x34,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x42,0x32,0x20,0x3d,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x38,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x43,0x4f,0x4d,0x50,0x55,0x54,0x45,0x5f,0x46,0x4c,0x4f,0x41,0x54,0x20,0x42,0x33,0x20,0x3d,0x20,0x42,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x31,0x32,0x5d,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x30,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x31,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x32,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x2e,0x73,0x33,0x20,0x2b,0x3d,0x20,0x41,0x20,0x2a,0x20,0x42,0x33,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x6e,0x74,0x20,0x69,0x6e,0x64,0x65,0x78,0x20,0x3d,0x20,0x28,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x6
5,0x6e,0x20,0x2d,0x20,0x31,0x29,0x20,0x3e,0x3e,0x20,0x32,0x29,0x20,0x2a,0x20,0x6f,0x66,0x66,0x73,0x65,0x74,0x20,0x2b,0x20,0x28,0x28,0x76,0x61,0x6c,0x75,0x65,0x5f,0x73,0x65,0x71,0x5f,0x6c,0x65,0x6e,0x20,0x2d,0x20,0x31,0x29,0x20,0x25,0x20,0x34,0x29,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x23,0x69,0x66,0x64,0x65,0x66,0x20,0x48,0x45,0x41,0x44,0x44,0x49,0x4d,0x5f,0x4c,0x45,0x41,0x56,0x45,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x5d,0x20,0x3d,0x20,0x42,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x31,0x20,0x3e,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x34,0x5d,0x20,0x3d,0x20,0x42,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x32,0x20,0x3e,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x38,0x5d,0x20,0x3d,0x20,0x42,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x69,0x66,0x28,0x7a,0x34,0x20,0x2b,0x20,0x33,0x20,0x3e,0x3d,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x29,0x20,0x72,0x65,0x74,0x75,0x72,0x6e,0x3b,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x31,0x32,0x5d,0x20,0x3d,0x20,0x42,0x33,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x33,0x3b,0xa,0x23,0x65,0x6c,0x73,0x65,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x5d,0x20,0x3d,0x20,0x42,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x34,0x5d,0x20,0x3d,0x20,0x42,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x38,0x5d,0x20,0x3d,0x20,0x42,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x50,0x61,0x73,0x74,0x76,0x61,0x6c,0x75,0x65,0x5f,0x6f,0x66,0x66,0x73,0x65,0x74,0x5b,0x69,0x6e,0x64,0x65,0x78,0x20,0x2b,0x20,0x31,0x32,0x5d,0x20,0x3d,0x20,0x42,0x33,0x3b,0xa,0x20,0x20,0x20,0x20,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7
a,0x34,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x30,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x31,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x31,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x32,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x32,0x3b,0xa,0x20,0x20,0x20,0x20,0x6f,0x75,0x74,0x70,0x75,0x74,0x5b,0x28,0x79,0x20,0x2a,0x20,0x68,0x65,0x61,0x64,0x5f,0x64,0x69,0x6d,0x20,0x2b,0x20,0x7a,0x34,0x20,0x2b,0x20,0x33,0x29,0x20,0x2a,0x20,0x34,0x5d,0x20,0x3d,0x20,0x6f,0x75,0x74,0x2e,0x73,0x33,0x3b,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0x20,0x20,0x20,0x20,0xa,0x23,0x65,0x6e,0x64,0x69,0x66,0xa,0x7d,0xa,0xa, } }, #endif #ifndef MNN_OPENCL_BUFFER_CLOSED diff --git a/source/core/ConvolutionCommon.cpp b/source/core/ConvolutionCommon.cpp index 9070fe9c7..6a333f0fa 100644 --- a/source/core/ConvolutionCommon.cpp +++ b/source/core/ConvolutionCommon.cpp @@ -53,6 +53,7 @@ std::shared_ptr ConvolutionCommon::load(const Con size_t weightLength = 0; int8_t *buffer = nullptr; auto originBuffer = (unsigned char *)buffer_ptr; + if (1 == quan->type()) { buffer = IDSTDecoder::ReadQuanData_c(originBuffer, &weightLength, result.get(), quan->shapeInt32()); } diff --git a/source/core/IDSTEncoder.hpp b/source/core/IDSTEncoder.hpp index 2d24b56b5..4c72f2d29 100644 --- a/source/core/IDSTEncoder.hpp +++ b/source/core/IDSTEncoder.hpp @@ -421,32 +421,44 @@ static bool WriteSparseQuanBlobs(std::ostream &out, const float* weightData, con static std::unique_ptr encode(const float* weight, const std::vector& scale, int kernelSize, int kernelNum, bool asymmetricQuantFlag, const int8_t* quantWeightPtr, const int clampMin, const int bits = 8, bool detectSparse = true) { - std::ostringstream outputStringStreamCQ, outputStringStreamSQ; - bool shapeUseInt32 = false; - WriteCQBlobs(outputStringStreamCQ, weight, scale.data(), kernelSize, kernelNum, asymmetricQuantFlag, shapeUseInt32, bits); - bool sparseValid = false; - if (detectSparse) { - sparseValid = WriteSparseQuanBlobs(outputStringStreamSQ, weight, scale.data(), kernelSize, kernelNum, asymmetricQuantFlag, shapeUseInt32, bits); + // compute block_size + + int alpha_size = scale.size(), block_size = kernelSize, block_num = 1; + if (asymmetricQuantFlag) alpha_size /= 2; + if (alpha_size > kernelNum) { + block_num = alpha_size / kernelNum; + block_size = kernelSize / block_num; } + bool shapeUseInt32 = false; std::unique_ptr idst(new IDSTQuanT); + std::ostringstream outputStringStreamCQ; + WriteCQBlobs(outputStringStreamCQ, weight, scale.data(), kernelSize, kernelNum, asymmetricQuantFlag, shapeUseInt32, bits); auto cqStr = outputStringStreamCQ.str(); - auto sqStr = outputStringStreamSQ.str(); - int int8Size = kernelNum * kernelSize; - idst->shapeInt32 = shapeUseInt32; - if (quantWeightPtr && (int8Size <= cqStr.size() && int8Size <= sqStr.size())) { - idst->type = 4; - idst->aMax = kernelNum; - idst->buffer.resize(int8Size); - ::memcpy(idst->buffer.data(), quantWeightPtr, int8Size); - } else if (cqStr.size() <= sqStr.size() || (!sparseValid)) { + if (detectSparse) { + std::ostringstream outputStringStreamSQ; + bool sparseValid = WriteSparseQuanBlobs(outputStringStreamSQ, weight, scale.data(), kernelSize, kernelNum, asymmetricQuantFlag, shapeUseInt32, 
bits); + auto sqStr = outputStringStreamSQ.str(); + int int8Size = kernelNum * kernelSize; + if (quantWeightPtr && (int8Size <= cqStr.size() && int8Size <= sqStr.size())) { + idst->type = 4; + idst->aMax = kernelNum; + idst->buffer.resize(int8Size); + ::memcpy(idst->buffer.data(), quantWeightPtr, int8Size); + } else if (cqStr.size() <= sqStr.size() || (!sparseValid)) { + idst->type = 1; + idst->buffer.resize(cqStr.size()); + ::memcpy(idst->buffer.data(), cqStr.data(), cqStr.size()); + } else { + idst->type = 2; + idst->buffer.resize(sqStr.size()); + ::memcpy(idst->buffer.data(), sqStr.data(), sqStr.size()); + } + } else { idst->type = 1; idst->buffer.resize(cqStr.size()); ::memcpy(idst->buffer.data(), cqStr.data(), cqStr.size()); - } else { - idst->type = 2; - idst->buffer.resize(sqStr.size()); - ::memcpy(idst->buffer.data(), sqStr.data(), sqStr.size()); } + idst->shapeInt32 = shapeUseInt32; idst->alpha.resize(scale.size()); ::memcpy(idst->alpha.data(), scale.data(), scale.size() * sizeof(float)); idst->quantScale = 1.f; diff --git a/source/core/Interpreter.cpp b/source/core/Interpreter.cpp index e9ec694cb..5078d1493 100644 --- a/source/core/Interpreter.cpp +++ b/source/core/Interpreter.cpp @@ -144,36 +144,11 @@ Interpreter* Interpreter::createFromBufferInternal(Content* net, bool enforceAut } void Interpreter::setSessionHint(HintMode mode, int hint) { - switch (mode) { - case MAX_TUNING_NUMBER: - mNet->modes.maxTuningNumber = hint; - break; - case MEM_ALLOCATOR_TYPE: - mNet->modes.memoryAllocatorType = hint; - break; - case WINOGRAD_MEMORY_LEVEL: - mNet->modes.winogradMemoryUsed = hint; - default: - break; - } + mNet->modes.setHint(mode, hint); } void Interpreter::setSessionMode(SessionMode mode) { - if (mode == Session_Input_Inside || mode == Session_Input_User) { - mNet->modes.inputMode = mode; - } else if (mode == Session_Output_User || mode == Session_Output_Inside) { - mNet->modes.outputMode = mode; - } else if (mode == Session_Backend_Auto || mode == Session_Backend_Fix) { - mNet->modes.backendMode = mode; - } else if (mode == Session_Debug || mode == Session_Release) { - mNet->modes.callBackMode = mode; - } else if (mode == Session_Resize_Direct || mode == Session_Resize_Defer) { - mNet->modes.resizeMode = mode; - } else if(mode == Session_Memory_Collect || mode == Session_Memory_Cache) { - mNet->modes.memoryUsageMode = mode; - } else if(mode == Session_Codegen_Disable || mode == Session_Codegen_Enable) { - mNet->modes.codegenMode = mode; - } else if (mode == Session_Resize_Check) { + if (mode == Session_Resize_Check) { for (auto& iter : mNet->sessions) { iter->openResizeCheck(); } @@ -181,6 +156,8 @@ void Interpreter::setSessionMode(SessionMode mode) { for (auto& iter : mNet->sessions) { iter->fixResizeCache(); } + } else { + mNet->modes.setMode(mode); } } diff --git a/source/core/Pipeline.cpp b/source/core/Pipeline.cpp index f2e030e5a..7033108c3 100644 --- a/source/core/Pipeline.cpp +++ b/source/core/Pipeline.cpp @@ -205,9 +205,9 @@ void Pipeline::UnitInfo::setUp(const Command& command, int index, const Op* orig #endif } -Pipeline::Pipeline(const std::string& externalFile, Schedule::PipelineInfo&& info, bool allocInput, bool outputStatic, const TuningAttr& tune, const Runtime* rt, const Runtime* cpuRt) +Pipeline::Pipeline(const std::string& externalFile, Schedule::PipelineInfo&& info, bool allocInput, bool outputStatic, const TuningAttr& tune, const Runtime* rt, const Runtime* cpuRt, int geometryMask) #ifndef MNN_BUILD_MINI - : mContext(info.first.cache.second, 
info.first.cache.first->type(), info.first.info.user ? info.first.info.user->precision : BackendConfig::Precision_Normal), mUseGeometry(rt->onGetCompilerType()) { + : mContext(geometryMask, info.first.cache.second, info.first.cache.first->type(), info.first.info.user ? info.first.info.user->precision : BackendConfig::Precision_Normal), mUseGeometry(rt->onGetCompilerType()) { #else { #endif diff --git a/source/core/Pipeline.hpp b/source/core/Pipeline.hpp index c32701db5..6fb9543d3 100644 --- a/source/core/Pipeline.hpp +++ b/source/core/Pipeline.hpp @@ -27,7 +27,7 @@ class Pipeline : public NonCopyable { bool autoSetOpType; int maxTuningNumber; }; - Pipeline(const std::string& externalFile, Schedule::PipelineInfo&& info, bool allocInput, bool outputStatic, const TuningAttr& tune, const Runtime* rt, const Runtime* cpuRt); + Pipeline(const std::string& externalFile, Schedule::PipelineInfo&& info, bool allocInput, bool outputStatic, const TuningAttr& tune, const Runtime* rt, const Runtime* cpuRt, int geometryMask); ~Pipeline(); ErrorCode fixResizeCache(); void openResizeCheck(); diff --git a/source/core/Session.cpp b/source/core/Session.cpp index 5998c9253..9ab6b460c 100644 --- a/source/core/Session.cpp +++ b/source/core/Session.cpp @@ -44,6 +44,44 @@ static void _createPipelineBackend(Schedule::PipelineInfo& iter, RuntimeInfo& ru iter.first.cache.second.reset(cpuRuntime->onCreate(&defaultConfig)); } } +void Session::ModeGroup::setMode(Interpreter::SessionMode mode) { + if (mode == Interpreter::Session_Input_Inside || mode == Interpreter::Session_Input_User) { + inputMode = mode; + } else if (mode == Interpreter::Session_Output_User || mode == Interpreter::Session_Output_Inside) { + outputMode = mode; + } else if (mode == Interpreter::Session_Backend_Auto || mode == Interpreter::Session_Backend_Fix) { + backendMode = mode; + } else if (mode == Interpreter::Session_Debug || mode == Interpreter::Session_Release) { + callBackMode = mode; + } else if (mode == Interpreter::Session_Resize_Direct || mode == Interpreter::Session_Resize_Defer) { + resizeMode = mode; + } else if(mode == Interpreter::Session_Memory_Collect || mode == Interpreter::Session_Memory_Cache) { + memoryUsageMode = mode; + } else if(mode == Interpreter::Session_Codegen_Disable || mode == Interpreter::Session_Codegen_Enable) { + codegenMode = mode; + } +} +void Session::ModeGroup::setHint(Interpreter::HintMode mode, int hint) { + switch (mode) { + case Interpreter::MAX_TUNING_NUMBER: + maxTuningNumber = hint; + break; + case Interpreter::MEM_ALLOCATOR_TYPE: + memoryAllocatorType = hint; + break; + case Interpreter::WINOGRAD_MEMORY_LEVEL: + winogradMemoryUsed = hint; + break; + case Interpreter::GEOMETRY_COMPUTE_MASK: + geometryMask = hint; + break; + case Interpreter::STRICT_CHECK_MODEL: + checkNetBuffer = hint > 0; + break; + default: + break; + } +} Session::Session(Schedule::ScheduleInfo&& info, const ModeGroup& mode, RuntimeInfo&& runtime) { mMode = mode; mRuntime = std::move(runtime); @@ -59,7 +97,7 @@ Session::Session(Schedule::ScheduleInfo&& info, const ModeGroup& mode, RuntimeIn attr.autoSetOpType = mode.backendMode == Interpreter::Session_Backend_Auto; auto rt = mRuntime.first.find(iter.first.info.type)->second.get(); auto cpuRuntime = mRuntime.second; - std::shared_ptr newPipeline(new Pipeline(mInfo.externalWeightPath, std::move(iter), mode.inputMode == Interpreter::Session_Input_Inside, mode.outputMode == Interpreter::Session_Output_User, attr, rt, cpuRuntime.get())); + std::shared_ptr newPipeline(new Pipeline( 
mInfo.externalWeightPath, std::move(iter), mode.inputMode == Interpreter::Session_Input_Inside, mode.outputMode == Interpreter::Session_Output_User, attr, rt, cpuRuntime.get(), mMode.geometryMask)); mPipelines.emplace_back(std::move(newPipeline)); } mCallBackMode = mode.callBackMode; diff --git a/source/core/Session.hpp b/source/core/Session.hpp index 7a1ba8963..7b3ac7caf 100644 --- a/source/core/Session.hpp +++ b/source/core/Session.hpp @@ -36,6 +36,10 @@ class MNN_PUBLIC Session { int memoryAllocatorType = 0; int maxTuningNumber = MNN_DEFAULT_TUNING_NUMBER; int winogradMemoryUsed = 3; + int geometryMask = 0xFFFF; + bool checkNetBuffer = true; + void setHint(Interpreter::HintMode hint, int magic); + void setMode(Interpreter::SessionMode mode); }; Session(Schedule::ScheduleInfo&& info, const ModeGroup& mode, RuntimeInfo&& runtime); diff --git a/source/core/TensorUtils.cpp b/source/core/TensorUtils.cpp index 3a0823a4a..587d0c170 100644 --- a/source/core/TensorUtils.cpp +++ b/source/core/TensorUtils.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include "core/Backend.hpp" #include "core/Macro.h" namespace MNN { @@ -403,21 +404,22 @@ bool TensorUtils::isDepthToSpaceRegions(const Tensor* output) { } // compute offset through region -static inline int offsetCompute(const Tensor::InsideDescribe::Region& reg, int offset, bool backward) { - Tensor::InsideDescribe::View src; - Tensor::InsideDescribe::View dst; +static inline int offsetCompute(const Tensor::InsideDescribe::Region& reg, int srcOffset, int dstOffset, bool backward) { + const Tensor::InsideDescribe::View* src; + const Tensor::InsideDescribe::View* dst; if (backward) { - src = reg.dst; - dst = reg.src; + src = ®.dst; + dst = ®.src; } else { - src = reg.src; - dst = reg.dst; + src = ®.src; + dst = ®.dst; } int res = 0; for (int i = 0; i < 3; i++) { if (reg.size[i] > 1) { - res += offset / src.stride[i] * dst.stride[i]; - offset %= src.stride[i]; + res += (srcOffset / src->stride[i] - dstOffset / src->stride[i]) * dst->stride[i]; + srcOffset %= src->stride[i]; + dstOffset %= src->stride[i]; } } return res; @@ -473,6 +475,75 @@ bool TensorUtils::refTensorContent(Tensor* dst, const Tensor* src) { return needMalloc; } +static bool _ClipDst(int* stride, int srcOffset, int dstOffset, const int* srcSize, const int* dstSize, const int sizeNum, int* dstMax, int* dstMin) { + /* Compute The range of dx, dy, dz: + s0 * (dx-sx) + s1 * (dy-sy) + s2 * (dz-sz) + (doff-soff) = 0 + Assume the region won't be overlapped, then extract doff -> s0*xd+ s1*yd+s2*zd, soff -> s0*xs+s1*ys+s2*zs + xd-xs=xo, yd-ys=yo, zd-zs=zo + then: + dx-sx+xo = 0 + dy-sy+yo = 0 + dz-sz+zo = 0 + dx=sx-xo -> [max(0, -xo), max(0, min(sxr-xo, dxr))] + dy,dz compute the same + **/ + + int offsetBias = dstOffset - srcOffset; + if (sizeNum == 0) { + // All stride is zero, then size will be all one + return offsetBias == 0; + } + int o[3] = {0, 0, 0}; + int validIndex[3] = {0, 1, 2}; + if (sizeNum == 2) { + if (stride[0] < stride[1]) { + validIndex[0] = 1; + validIndex[1] = 0; + } + } else if (sizeNum > 2) { + int maxs = stride[0]; + int mins = stride[0]; + int maxi = 0; + int mini = 0; + // Sort index by stride + for (int i=1; i maxs) { + maxs = s; + maxi = i; + } + if (s < mins) { + mins = s; + mini = i; + } + } + for (int i=0; i dstReg.src.offset || - srcReg.dst.stride[1] > srcReg.size[2] || - srcReg.dst.stride[2] > srcReg.size[1] * srcReg.size[2]) { - return false; +class TensorUtils::FuseRegionStatus { +public: + enum Status { + FUSE_SRC_COPY, + FUSE_DST_COPY, + 
FUSE_REGION_COMPUTE + }; + void apply(const Tensor::InsideDescribe::Region& srcReg, Tensor::InsideDescribe::Region& dstReg) { + switch (mStatus) { + case FUSE_SRC_COPY: + dstReg.origin = srcReg.origin; + dstReg.src.offset += srcReg.src.offset - srcReg.dst.offset; + break; + case FUSE_DST_COPY: + dstReg.origin = srcReg.origin; + dstReg.dst = srcReg.dst; + dstReg.src = srcReg.src; + dstReg.src.offset = mSrcOff; + dstReg.dst.offset = mDstOff; + dstReg.size[0] = srcReg.size[0]; + dstReg.size[1] = srcReg.size[1]; + dstReg.size[2] = srcReg.size[2]; + break; + case FUSE_REGION_COMPUTE: + { + if (dstSize[0] == 0) { + dstReg.size[0] = 0; + dstReg.origin = nullptr; + break; + } + for (int i=0; i<3; ++i) { + dstReg.size[i] = 1; + dstReg.src.stride[i] = 0; + dstReg.dst.stride[i] = 0; + } + int valid[3] = {0, 0, 0}; + int offset = 3 - dstNum; + if (dstNum > sizeNum) { + for (int i = 2; i >= 0; i--) { + if (i < dstNum) { + if (dstSize[i] == 1) { + expandIdx = i; + } + dstReg.size[i+offset] = dstMax[i] - dstMin[i]; + valid[i] = dstSize[i] > 1; + } else { + dstReg.size[i+offset] = 1; + valid[i] = 0; + } + } + } else { + for (int i=0; i 1 ? 1 : 0; + } + } + int idx = 0; + for (int i = 0; i < 3; i++) { + if (valid[i] > 0 || i == expandIdx) { + dstReg.src.stride[i+offset] = newSrc[idx]; + dstReg.dst.stride[i+offset] = dstDst[idx++]; + } + } + dstReg.origin = srcReg.origin; + dstReg.src.offset = newSrcOffset; + dstReg.dst.offset = newDstOffset; + } + break; + default: + break; + } } - int dstTotalSize = 1, srcTotalSize = 1; - for (int i = 0; i < 3; i++) { - if (dstReg.size[i] > 1) { - dstTotalSize *= dstReg.size[i]; + bool match(const Tensor::InsideDescribe::Region& srcReg, const Tensor::InsideDescribe::Region& dstReg) { + // dont deal size > 1 && stride <= 0 + for (int i = 0; i < 3; i++) { + if (srcReg.size[i] > 1 && (srcReg.src.stride[i] <= 0 || srcReg.dst.stride[i] <= 0)) { + return false; + } + if (dstReg.size[i] > 1 && (dstReg.src.stride[i] <= 0 || dstReg.dst.stride[i] <= 0)) { + return false; + } } - if (srcReg.size[i] > 1) { - srcTotalSize *= srcReg.size[i]; + bool copyValid = true; + // src data isnot full data of dst + if (srcReg.dst.offset > dstReg.src.offset || + srcReg.dst.stride[1] > srcReg.size[2] || + srcReg.dst.stride[2] > srcReg.size[1] * srcReg.size[2]) { + copyValid = false; + } + int dstTotalSize = 1, srcTotalSize = 1; + int dstSrcMin = dstReg.src.offset; + int dstSrcMax = dstSrcMin; + int srcDstMin = srcReg.dst.offset; + int srcDstMax = srcDstMin; + for (int i = 0; i < 3; i++) { + srcDstMax += srcReg.dst.stride[i] * (srcReg.size[i] - 1); + dstSrcMax += dstReg.src.stride[i] * (dstReg.size[i] - 1); + if (dstReg.size[i] > 1) { + dstTotalSize *= dstReg.size[i]; + } + if (srcReg.size[i] > 1) { + srcTotalSize *= srcReg.size[i]; + } } - } - // src data is not full data of dst - if (dstTotalSize > srcTotalSize) { - return false; - } - // dont deal size > 1 && stride <= 0 - for (int i = 0; i < 3; i++) { - if (srcReg.size[i] > 1 && (srcReg.src.stride[i] <= 0 || srcReg.dst.stride[i] <= 0)) { - return false; + // src data is not full data of dst + if (dstTotalSize > srcTotalSize) { + copyValid = false; } - if (dstReg.size[i] > 1 && (dstReg.src.stride[i] <= 0 || dstReg.dst.stride[i] <= 0)) { - return false; + // Valid range is from srcReg: srcDstMin - srcDstMax, if dst's srcReg exceed, not valid for copy + if (srcDstMin > dstSrcMin || srcDstMax < dstSrcMax) { + copyValid = false; } - } - // src copy fuse - if (isCopyRegion(srcReg)) { - dstReg.origin = srcReg.origin; - dstReg.src.offset += 
srcReg.src.offset - srcReg.dst.offset; - return true; - } - // dst copy fuse - if (isCopyRegion(dstReg) && dstTotalSize == srcTotalSize) { - int srcOff = dstReg.src.offset - srcReg.dst.offset; - int dstOff = dstReg.dst.offset; - srcOff = offsetCompute(srcReg, srcOff, true) + srcReg.src.offset; - if (srcReg.src.stride[2] > 0 && srcOff % srcReg.src.stride[2] != 0) { - // when transpose + slice, offset is not align can't fuse - return false; + // src copy fuse + if (isCopyRegion(srcReg) && copyValid) { + mStatus = FUSE_SRC_COPY; + return true; } - dstReg.origin = srcReg.origin; - dstReg.dst = srcReg.dst; - dstReg.src = srcReg.src; - dstReg.src.offset = srcOff; - dstReg.dst.offset = dstOff; - dstReg.size[0] = srcReg.size[0]; - dstReg.size[1] = srcReg.size[1]; - dstReg.size[2] = srcReg.size[2]; - return true; - } -#define MNN_FAST_FUSE_WITHOUT_STL -#ifdef MNN_FAST_FUSE_WITHOUT_STL - // general fuse - int srcDst[3], srcSrc[3], dstSrc[3], dstDst[3], srcSize[3], dstSize[3], newSrc[3], dstStride[3], srcStride[3]; -#define MNN_3_INT_INIT(x, y) { x[0] = y; x[1] = y; x[2] = y; } - MNN_3_INT_INIT(dstStride, -1) - MNN_3_INT_INIT(srcStride, -1) -#undef MNN_3_INT_INIT - int srcNum = 0, dstNum = 0, sizeNum = 0; - for (int i = 0; i < 3; i++) { - if (srcReg.size[i] > 1) { - srcStride[srcNum] = srcReg.dst.stride[i]; - srcDst[srcNum] = srcReg.dst.stride[i]; - srcSrc[srcNum] = srcReg.src.stride[i]; - srcSize[srcNum] = srcReg.size[i]; - srcNum++; - } - if (dstReg.size[i] > 1) { - dstStride[dstNum] = dstReg.src.stride[i]; - dstDst[dstNum] = dstReg.dst.stride[i]; - dstSrc[dstNum] = dstReg.src.stride[i]; - dstSize[dstNum] = dstReg.size[i]; - dstNum++; - } - } - sizeNum = dstNum; -#define MNN_3_INT_DIFF(r, x, y, i) if ((x[i] != y[0]) && (x[i] != y[1]) && (x[i] != y[2])) { if (r > 0) { return false; } else { r = x[i]; } } - int srcExtra = -1, dstExtra = -1; - MNN_3_INT_DIFF(srcExtra, srcStride, dstStride, 0) - MNN_3_INT_DIFF(srcExtra, srcStride, dstStride, 1) - MNN_3_INT_DIFF(srcExtra, srcStride, dstStride, 2) - MNN_3_INT_DIFF(dstExtra, dstStride, srcStride, 0) - MNN_3_INT_DIFF(dstExtra, dstStride, srcStride, 1) - MNN_3_INT_DIFF(dstExtra, dstStride, srcStride, 2) -#undef MNN_3_INT_DIFF - if (dstExtra > 0) { - if (!expandStrideSize(srcDst, srcSrc, srcSize, srcNum, dstExtra)) { - return false; + // dst copy fuse + if (isCopyRegion(dstReg) && dstTotalSize == srcTotalSize && copyValid) { + mSrcOff = dstReg.src.offset - srcReg.dst.offset; + mDstOff = dstReg.dst.offset; + mSrcOff = offsetCompute(srcReg, dstReg.src.offset, srcReg.dst.offset, true) + srcReg.src.offset; + if (!(srcReg.src.stride[2] > 0 && mSrcOff % srcReg.src.stride[2] != 0)) { + // when transpose + slice, offset is not align can't fuse + mStatus = FUSE_DST_COPY; + return true; + } } - } - if (srcExtra > 0) { - if (!expandStrideSize(dstSrc, dstDst, dstSize, dstNum, srcExtra)) { - return false; + #define MNN_3_INT_INIT(x, y) { x[0] = y; x[1] = y; x[2] = y; } + MNN_3_INT_INIT(dstStride, -1) + MNN_3_INT_INIT(srcStride, -1) + expandIdx = -1; + #undef MNN_3_INT_INIT + srcNum = 0, dstNum = 0, sizeNum = 0; + for (int i = 0; i < 3; i++) { + if (srcReg.size[i] > 1) { + srcStride[srcNum] = srcReg.dst.stride[i]; + srcDst[srcNum] = srcReg.dst.stride[i]; + srcSrc[srcNum] = srcReg.src.stride[i]; + srcSize[srcNum] = srcReg.size[i]; + srcNum++; + } + if (dstReg.size[i] > 1) { + dstStride[dstNum] = dstReg.src.stride[i]; + dstDst[dstNum] = dstReg.dst.stride[i]; + dstSrc[dstNum] = dstReg.src.stride[i]; + dstSize[dstNum] = dstReg.size[i]; + dstNum++; + } } - } - // reorder 
srcSrc to newSrc by align srcDst and dstSrc - for (int i = 0; i < dstNum; i++) { - int index = 0; - for (int j = 0; j < srcNum; j++) { - if (dstSrc[j] == srcDst[i]) { - index = j; + sizeNum = dstNum; + #define MNN_3_INT_DIFF(r, x, y, i) if ((x[i] != y[0]) && (x[i] != y[1]) && (x[i] != y[2])) { if (r > 0) { return false; } else { r = x[i]; } } + int srcExtra = -1, dstExtra = -1; + MNN_3_INT_DIFF(srcExtra, srcStride, dstStride, 0) + MNN_3_INT_DIFF(srcExtra, srcStride, dstStride, 1) + MNN_3_INT_DIFF(srcExtra, srcStride, dstStride, 2) + MNN_3_INT_DIFF(dstExtra, dstStride, srcStride, 0) + MNN_3_INT_DIFF(dstExtra, dstStride, srcStride, 1) + MNN_3_INT_DIFF(dstExtra, dstStride, srcStride, 2) + #undef MNN_3_INT_DIFF + if (dstExtra > 0) { + if (!expandStrideSize(srcDst, srcSrc, srcSize, srcNum, dstExtra)) { + return false; } } - newSrc[index] = srcSrc[i]; - } - // set final size and set expandIdx if expand val is 1 - int expandIdx = -1; - int newSrcOffset = offsetCompute(srcReg, dstReg.src.offset - srcReg.dst.offset, true) + srcReg.src.offset; - if (nullptr != srcReg.origin) { - bool valid = _RegionValid(newSrc, newSrcOffset, dstSize, dstNum, TensorUtils::getRawSize(srcReg.origin)); - if (!valid) { - // Exceed src range - return false; + if (srcExtra > 0) { + if (!expandStrideSize(dstSrc, dstDst, dstSize, dstNum, srcExtra)) { + return false; + } } - } - if (dstNum > sizeNum) { - for (int i = 2; i >= 0; i--) { - if (i < dstNum) { - if (dstSize[i] == 1) { - expandIdx = i; + // reorder srcSrc to newSrc by align srcDst and dstSrc + for (int i = 0; i < srcNum; i++) { + int index = -1; + for (int j = 0; j < dstNum; j++) { + if (dstSrc[j] == srcDst[i]) { + index = j; + break; } - dstReg.size[i] = dstSize[i]; - } else { - dstReg.size[i] = 1; } + if (-1 == index) { + return false; + } + newSrc[index] = srcSrc[i]; + newSrcSize[index] = srcSize[i]; } - } -#else - // general fuse - std::set dstStride, srcStride, dstDiff, srcDiff; - std::vector dstDst, dstSrc, srcDst, srcSrc, newSrc, dstSize, srcSize; - for (int i = 0; i < 3; i++) { - if (srcReg.size[i] > 1) { - srcStride.insert(srcReg.dst.stride[i]); - srcDst.push_back(srcReg.dst.stride[i]); - srcSrc.push_back(srcReg.src.stride[i]); - srcSize.push_back(srcReg.size[i]); - } - if (dstReg.size[i] > 1) { - dstStride.insert(dstReg.src.stride[i]); - dstDst.push_back(dstReg.dst.stride[i]); - dstSrc.push_back(dstReg.src.stride[i]); - dstSize.push_back(dstReg.size[i]); - } - } - int sizeNum = dstSize.size(); - std::set_difference(dstStride.begin(), dstStride.end(), srcStride.begin(), srcStride.end(), std::inserter(dstDiff, dstDiff.begin())); - std::set_difference(srcStride.begin(), srcStride.end(), dstStride.begin(), dstStride.end(), std::inserter(srcDiff, srcDiff.begin())); - if (dstDiff.size() > 1 || srcDiff.size() > 1) { - // many diff stride, now dont deal - return false; - } - // expand stride when middle tensor's stride diff - if (!dstDiff.empty()) { - if (!expandSrc(srcDst, srcSrc, srcSize, *dstDiff.begin())) { - return false; - } - } - if (!srcDiff.empty()) { - if (!expandSrc(dstSrc, dstDst, dstSize, *srcDiff.begin())) { + // set final size and set expandIdx if expand val is 1 + newSrcOffset = offsetCompute(srcReg, dstReg.src.offset, srcReg.dst.offset, true) + srcReg.src.offset; + bool valid = _ClipDst(dstSrc, srcReg.dst.offset, dstReg.src.offset, newSrcSize, dstSize, dstNum, dstMax, dstMin); + if (!valid) { return false; } - } - if (dstSize.size() > 3) { - // need splite region, dont deal - return false; - } - // reorder srcSrc to newSrc by align srcDst and 
dstSrc - newSrc.resize(srcSrc.size()); - for (int i = 0; i < dstSrc.size(); i++) { - int index = std::distance(dstSrc.begin(), std::find(dstSrc.begin(), dstSrc.end(), srcDst[i])); - newSrc[index] = srcSrc[i]; - } - // set final size and set expandIdx if expand val is 1 - int expandIdx = -1; - if (dstSize.size() > sizeNum) { - for (int i = 2; i >= 0; i--) { - if (i < dstSize.size()) { - if (dstSize[i] == 1) { - expandIdx = i; - } - dstReg.size[i] = dstSize[i]; - } else { - dstReg.size[i] = 1; + newDstOffset = dstReg.dst.offset; + for (int i=0; i 0) { + newDstOffset += dstMin[i] * dstDst[i]; + newSrcOffset += dstMin[i] * newSrc[i]; } } + mStatus = FUSE_REGION_COMPUTE; + return true; } -#endif - int idx = 0; - for (int i = 0; i < 3; i++) { - if (dstReg.size[i] > 1 || i == expandIdx) { - dstReg.src.stride[i] = newSrc[idx]; - dstReg.dst.stride[i] = dstDst[idx++]; - } - } - dstReg.origin = srcReg.origin; - dstReg.src.offset = newSrcOffset; - return true; +private: + int mStatus; + int mSrcOff; + int mDstOff; + // general fuse + int srcDst[3], srcSrc[3], dstSrc[3], dstDst[3], srcSize[3], dstSize[3], newSrc[3], dstStride[3], srcStride[3]; + int dstMin[3],dstMax[3]; + int newSrcSize[3]; + int srcNum, dstNum, sizeNum; + int newSrcOffset; + int newDstOffset; + int expandIdx; +}; + +TensorUtils::FuseWrap::FuseWrap() { + mStatus = new FuseRegionStatus; +} +TensorUtils::FuseWrap::~ FuseWrap() { + delete mStatus; +} +bool TensorUtils::FuseWrap::match(const Tensor::InsideDescribe::Region& srcReg, const Tensor::InsideDescribe::Region& dstReg) { + return mStatus->match(srcReg, dstReg); } +void TensorUtils::FuseWrap::apply(const Tensor::InsideDescribe::Region& srcReg, Tensor::InsideDescribe::Region& dstReg) { + mStatus->apply(srcReg, dstReg); +} + void TensorUtils::adjustTensorForCompability(Tensor* newTensor) { if (newTensor->dimensions() < 4) { for (int n = newTensor->dimensions(); n < 4; ++n) { diff --git a/source/core/TensorUtils.hpp b/source/core/TensorUtils.hpp index d8f8498ec..d98354597 100644 --- a/source/core/TensorUtils.hpp +++ b/source/core/TensorUtils.hpp @@ -187,7 +187,17 @@ class MNN_PUBLIC TensorUtils { static bool isTileRegion(const Tensor::InsideDescribe::Region& region); static bool isDepthToSpaceRegions(const Tensor* output); static bool reshapeSlice(Tensor::InsideDescribe::Region& slice, int outside, int inside, int axis); - static bool fuseRegion(Tensor::InsideDescribe::Region& srcReg, Tensor::InsideDescribe::Region& dstReg); + + class FuseRegionStatus; + class FuseWrap { + public: + FuseWrap(); + ~ FuseWrap(); + bool match(const Tensor::InsideDescribe::Region& srcReg, const Tensor::InsideDescribe::Region& dstReg); + void apply(const Tensor::InsideDescribe::Region& srcReg, Tensor::InsideDescribe::Region& dstReg); + private: + FuseRegionStatus* mStatus; + }; static void adjustTensorForCompability(Tensor* t); static Tensor::DimensionType getDimType(const Tensor* t); static std::vector getQuantInfo(const Tensor* t); diff --git a/source/geometry/GeometryBinary.cpp b/source/geometry/GeometryBinary.cpp index 81d8b0869..1e6610c11 100644 --- a/source/geometry/GeometryBinary.cpp +++ b/source/geometry/GeometryBinary.cpp @@ -122,93 +122,92 @@ class GeometryBinary : public GeometryComputer { input1Broadcast = true; } #ifdef MNN_BINARY_LOOP_OPT - if (input0Broadcast || input1Broadcast) { - if (inp0format == outFormat && inp1format == outFormat && outFormat != MNN_DATA_FORMAT_NC4HW4 && input0->getType().code == halide_type_float && op->main_as_BinaryOp()->activationType() == 0) { - if 
(!(input0Broadcast && input1Broadcast)) { -// if (false) { - // Use Loop instead of broadcast - std::shared_ptr newTensor(new Tensor); - TensorUtils::copyShape(output, newTensor.get(), true); - newTensor->buffer().type = output->buffer().type; - int srcIndex = 1; - int dstIndex = 2; - if (input0Broadcast) { - ConvertUtils::broadcastto(input0, newTensor.get()); - } else { - srcIndex = 2; - dstIndex = 1; - ConvertUtils::broadcastto(input1, newTensor.get()); - } - auto des = TensorUtils::getDescribe(newTensor.get()); - flatbuffers::FlatBufferBuilder builder; - BinaryOpBuilder binaryOpParamBuilder(builder); - binaryOpParamBuilder.add_opType(op->main_as_BinaryOp()->opType()); - auto binaryOpParamOffset = binaryOpParamBuilder.Finish(); - OpBuilder cmdOpBuilder(builder); - cmdOpBuilder.add_type(OpType_BinaryOp); - cmdOpBuilder.add_main(binaryOpParamOffset.Union()); - cmdOpBuilder.add_main_type(OpParameter_BinaryOp); - auto cmdOpOffset = cmdOpBuilder.Finish(); - auto iterIndexesOffset = builder.CreateVector(std::vector{-1, -1, -1}); - auto stepOffset = builder.CreateVector(std::vector{0, 0, 0}); - auto indexesOffset = builder.CreateVector(std::vector{2, 0, 1}); - std::vector> regionCommands; + // One input need broadcast, the other needn't + bool singleBroadCast = (!(input0Broadcast && input1Broadcast)) && (input0Broadcast || input1Broadcast); + bool forwardSupportLoop = inp0format == outFormat && inp1format == outFormat && outFormat != MNN_DATA_FORMAT_NC4HW4 && input0->getType().code == halide_type_float && op->main_as_BinaryOp()->activationType() == 0; + bool openLoop = context.support(Interpreter::GeometryComputeMask::GEOMETRCOMPUTEMASK_USELOOP); + if (singleBroadCast && forwardSupportLoop && openLoop) { + // Use Loop instead of broadcast + std::shared_ptr newTensor(new Tensor); + TensorUtils::copyShape(output, newTensor.get(), true); + newTensor->buffer().type = output->buffer().type; + int srcIndex = 1; + int dstIndex = 2; + if (input0Broadcast) { + ConvertUtils::broadcastto(input0, newTensor.get()); + } else { + srcIndex = 2; + dstIndex = 1; + ConvertUtils::broadcastto(input1, newTensor.get()); + } + auto des = TensorUtils::getDescribe(newTensor.get()); + flatbuffers::FlatBufferBuilder builder; + BinaryOpBuilder binaryOpParamBuilder(builder); + binaryOpParamBuilder.add_opType(op->main_as_BinaryOp()->opType()); + auto binaryOpParamOffset = binaryOpParamBuilder.Finish(); + OpBuilder cmdOpBuilder(builder); + cmdOpBuilder.add_type(OpType_BinaryOp); + cmdOpBuilder.add_main(binaryOpParamOffset.Union()); + cmdOpBuilder.add_main_type(OpParameter_BinaryOp); + auto cmdOpOffset = cmdOpBuilder.Finish(); + auto iterIndexesOffset = builder.CreateVector(std::vector{-1, -1, -1}); + auto stepOffset = builder.CreateVector(std::vector{0, 0, 0}); + auto indexesOffset = builder.CreateVector(std::vector{2, 0, 1}); + std::vector> regionCommands; - for (int i=0; iregions.size(); ++i) { - auto& reg = des->regions[i]; - auto sizeOffset = builder.CreateVector(reg.size, 3); - auto dstStride = builder.CreateVector(reg.dst.stride, 3); - auto srcStride = builder.CreateVector(reg.src.stride, 3); - std::vector> views(3); - { - ViewBuilder dstBuilder(builder); - dstBuilder.add_offset(reg.dst.offset); - dstBuilder.add_stride(dstStride); - views[0] = dstBuilder.Finish(); - views[dstIndex] = views[0]; - ViewBuilder srcBuilder(builder); - srcBuilder.add_offset(reg.src.offset); - srcBuilder.add_stride(srcStride); - views[srcIndex] = srcBuilder.Finish(); - } - auto viewsOffset = builder.CreateVector>(views); - RegionCommandBuilder 
cmdBuilder(builder); - cmdBuilder.add_op(cmdOpOffset); - cmdBuilder.add_view(viewsOffset); - cmdBuilder.add_size(sizeOffset); - cmdBuilder.add_steps(stepOffset); - cmdBuilder.add_iterIndexes(iterIndexesOffset); - cmdBuilder.add_indexes(indexesOffset); - - regionCommands.emplace_back(cmdBuilder.Finish()); - } - auto rcmdAllOffset = builder.CreateVector>(regionCommands); - auto inputIndexesOffset = builder.CreateVector(std::vector{0, 1}); - auto outputIndexesOffset = builder.CreateVector(std::vector{2}); - LoopParamBuilder loopBuilder(builder); - loopBuilder.add_commands(rcmdAllOffset); - loopBuilder.add_loopNumber(1); - loopBuilder.add_tensorNumber(3); - loopBuilder.add_inputIndexes(inputIndexesOffset); - loopBuilder.add_outputIndexes(outputIndexesOffset); - auto loopOffset = loopBuilder.Finish(); - flatbuffers::Offset nameOffset; - if (nullptr != op->name()) { - nameOffset = builder.CreateString(op->name()->c_str()); - } - OpBuilder finishBuilder(builder); - finishBuilder.add_main(loopOffset.Union()); - finishBuilder.add_main_type(OpParameter_LoopParam); - finishBuilder.add_type(OpType_While); - if (nullptr != op->name()) { - finishBuilder.add_name(nameOffset); - } - builder.Finish(finishBuilder.Finish()); - auto cmd = GeometryComputerUtils::makeCommand(builder, {input0, input1}, outputs); - res.command.emplace_back(std::move(cmd)); - return true; + for (int i=0; iregions.size(); ++i) { + auto& reg = des->regions[i]; + auto sizeOffset = builder.CreateVector(reg.size, 3); + auto dstStride = builder.CreateVector(reg.dst.stride, 3); + auto srcStride = builder.CreateVector(reg.src.stride, 3); + std::vector> views(3); + { + ViewBuilder dstBuilder(builder); + dstBuilder.add_offset(reg.dst.offset); + dstBuilder.add_stride(dstStride); + views[0] = dstBuilder.Finish(); + views[dstIndex] = views[0]; + ViewBuilder srcBuilder(builder); + srcBuilder.add_offset(reg.src.offset); + srcBuilder.add_stride(srcStride); + views[srcIndex] = srcBuilder.Finish(); } + auto viewsOffset = builder.CreateVector>(views); + RegionCommandBuilder cmdBuilder(builder); + cmdBuilder.add_op(cmdOpOffset); + cmdBuilder.add_view(viewsOffset); + cmdBuilder.add_size(sizeOffset); + cmdBuilder.add_steps(stepOffset); + cmdBuilder.add_iterIndexes(iterIndexesOffset); + cmdBuilder.add_indexes(indexesOffset); + + regionCommands.emplace_back(cmdBuilder.Finish()); + } + auto rcmdAllOffset = builder.CreateVector>(regionCommands); + auto inputIndexesOffset = builder.CreateVector(std::vector{0, 1}); + auto outputIndexesOffset = builder.CreateVector(std::vector{2}); + LoopParamBuilder loopBuilder(builder); + loopBuilder.add_commands(rcmdAllOffset); + loopBuilder.add_loopNumber(1); + loopBuilder.add_tensorNumber(3); + loopBuilder.add_inputIndexes(inputIndexesOffset); + loopBuilder.add_outputIndexes(outputIndexesOffset); + auto loopOffset = loopBuilder.Finish(); + flatbuffers::Offset nameOffset; + if (nullptr != op->name()) { + nameOffset = builder.CreateString(op->name()->c_str()); } + OpBuilder finishBuilder(builder); + finishBuilder.add_main(loopOffset.Union()); + finishBuilder.add_main_type(OpParameter_LoopParam); + finishBuilder.add_type(OpType_While); + if (nullptr != op->name()) { + finishBuilder.add_name(nameOffset); + } + builder.Finish(finishBuilder.Finish()); + auto cmd = GeometryComputerUtils::makeCommand(builder, {input0, input1}, outputs); + res.command.emplace_back(std::move(cmd)); + return true; } #endif if (input0Broadcast) { diff --git a/source/geometry/GeometryComputer.cpp b/source/geometry/GeometryComputer.cpp index 
29a34cade..7c9887d30 100644 --- a/source/geometry/GeometryComputer.cpp +++ b/source/geometry/GeometryComputer.cpp @@ -7,6 +7,7 @@ // #include +#include #include "geometry/GeometryComputer.hpp" #include "core/Backend.hpp" #include "core/OpCommonUtils.hpp" @@ -18,7 +19,7 @@ namespace MNN { GeometryComputer::Context::~Context() { // Do nothing } -GeometryComputer::Context::Context(std::shared_ptr allocBackend, MNNForwardType type, BackendConfig::PrecisionMode precision) { +GeometryComputer::Context::Context(int mask, std::shared_ptr allocBackend, MNNForwardType type, BackendConfig::PrecisionMode precision) : mMask(mask) { mBackend = allocBackend; flatbuffers::FlatBufferBuilder builder(32); OpBuilder opBuilder(builder); @@ -287,20 +288,83 @@ void GeometryComputer::Context::getRasterCacheCreateRecursive(Tensor* src, Comma if (_hasZeroDim(src)) { return; } - for (auto& input : srcDes->regions) { - MNN_ASSERT(input.origin != src); - auto inputDes = TensorUtils::getDescribe(input.origin); - while (_virtualMemory(inputDes)) { - if (1 != inputDes->regions.size()) { + bool needDelete = false; + bool supportFuse = support(Interpreter::GEOMETRCOMPUTEMASK_FUSEREGION); + bool supportFuseMulti = support(Interpreter::GEOMETRCOMPUTEMASK_FUSEREGION_MULTI); + for (int regIndex = 0; regIndex < srcDes->regions.size();) { + auto input = srcDes->regions.data() + regIndex; + MNN_ASSERT(input->origin != src); + + auto inputDes = TensorUtils::getDescribe(input->origin); + while (_virtualMemory(inputDes) && supportFuse) { + if (0 == inputDes->regions.size()) { + // Empty Input, Remove the region by set size as 0 + input->size[0] = 0; + needDelete = true; break; } - bool merge = TensorUtils::fuseRegion(inputDes->regions[0], input); - if (!merge) { + if (1 < inputDes->regions.size()) { + if (!supportFuseMulti) { + break; + } + bool allCanMerge = true; + for (auto& reg : inputDes->regions) { + allCanMerge = allCanMerge && mFuseUtils.match(reg, *input); + if (!allCanMerge) { + break; + } + } + if (!allCanMerge) { + break; + } + Tensor::InsideDescribe::Region backup = *input; + mFuseUtils.match(inputDes->regions[0], *input); + mFuseUtils.apply(inputDes->regions[0], *input); + for (int i=1; iregions.size(); ++i) { + auto newReg = backup; + mFuseUtils.match(inputDes->regions[i], newReg); + mFuseUtils.apply(inputDes->regions[i], newReg); + if (newReg.size[0] == 0) { + continue; + } + srcDes->regions.emplace_back(newReg); + } + // After emplace_back, the input will change, reref it + input = srcDes->regions.data() + regIndex; + if (input->size[0] == 0) { + needDelete = true; + break; + } + inputDes = TensorUtils::getDescribe(input->origin); + continue; + } + bool merge = mFuseUtils.match(inputDes->regions[0], *input); + if (merge) { + mFuseUtils.apply(inputDes->regions[0], *input); + } else { break; } - inputDes = TensorUtils::getDescribe(input.origin); + if (input->size[0] == 0) { + needDelete = true; + break; + } + inputDes = TensorUtils::getDescribe(input->origin); + } + if (input->size[0] > 0) { + getRasterCacheCreateRecursive(input->origin, cmd); + } + ++regIndex; + } + if (needDelete) { + auto regions = std::move(srcDes->regions); + srcDes->regions.reserve(regions.size()); + for (int regIndex = 0; regIndex < regions.size(); ++regIndex) { + auto input = std::move(regions[regIndex]); + if (input.size[0] == 0 || input.size[1] == 0 || input.size[2] == 0) { + continue; + } + srcDes->regions.emplace_back(std::move(input)); } - getRasterCacheCreateRecursive(input.origin, cmd); } getRasterCacheCreate(src, cmd); } diff --git 
a/source/geometry/GeometryComputer.hpp b/source/geometry/GeometryComputer.hpp index a049be684..f826e4d99 100644 --- a/source/geometry/GeometryComputer.hpp +++ b/source/geometry/GeometryComputer.hpp @@ -23,7 +23,7 @@ class GeometryComputer { } class MNN_PUBLIC Context { public: - Context(std::shared_ptr allocBackend, MNNForwardType type = MNN_FORWARD_CPU, BackendConfig::PrecisionMode precision = BackendConfig::Precision_Normal); + Context(int mask, std::shared_ptr allocBackend, MNNForwardType type = MNN_FORWARD_CPU, BackendConfig::PrecisionMode precision = BackendConfig::Precision_Normal); ~Context(); void clear(); @@ -41,6 +41,9 @@ class GeometryComputer { inline BackendConfig::PrecisionMode precisionType() const { return mPrecision; } + inline bool support(int option) const { + return mMask & option; + } std::shared_ptr mRasterOp; private: void getRasterCacheCreate(Tensor* src, CommandBuffer& cmd); @@ -50,6 +53,8 @@ class GeometryComputer { std::shared_ptr mBackend; MNNForwardType mForwardType; BackendConfig::PrecisionMode mPrecision; + TensorUtils::FuseWrap mFuseUtils; + const int mMask; }; static void init(); MNN_PUBLIC static const GeometryComputer* search(int opType, Runtime::CompilerType compType); diff --git a/source/geometry/GeometryComputerUtils.cpp b/source/geometry/GeometryComputerUtils.cpp index 54db0c1d0..fc76622ab 100644 --- a/source/geometry/GeometryComputerUtils.cpp +++ b/source/geometry/GeometryComputerUtils.cpp @@ -147,8 +147,9 @@ ErrorCode GeometryComputerUtils::shapeComputeAndGeometryTransform( Runtime::CompilerType compileType, bool skipShapeCompute, bool permitCodegen) { + bool openCache = geoContext.support(Interpreter::GeometryComputeMask::GEOMETRCOMPUTEMASK_OPENCACHE); /** Size Compute and compute Const Begin */ - GeometryComputer::Context ctx(backupBackend); + GeometryComputer::Context ctx(Interpreter::GeometryComputeMask::GEOMETRCOMPUTEMASK_ALL, backupBackend); // Size Compute and compute Const for (int i=0; itype(), Runtime::Compiler_Loop); { - auto res = geo->onRecompute(info.op, info.inputs, info.outputs, geoContext, tempBuffer); + bool res = false; + if (openCache) { + res = geo->onRecompute(info.op, info.inputs, info.outputs, geoContext, tempBuffer); + } if (!res) { tempBuffer.command.clear(); tempBuffer.extras.clear(); @@ -350,7 +354,7 @@ ErrorCode GeometryComputerUtils::shapeComputeAndGeometryTransform( auto geo = GeometryComputer::search(info.op->type(), compileType); { bool res = false; - if (!tempBuffer.hasWrap) { + if ((!tempBuffer.hasWrap) && openCache) { res = geo->onRecompute(info.op, info.inputs, info.outputs, geoContext, tempBuffer); } if (!res) { diff --git a/test.sh b/test.sh index 0d0e90e1b..9d7c1b2d7 100755 --- a/test.sh +++ b/test.sh @@ -294,7 +294,7 @@ model_test() { echo '### 静态模型测试失败,测试终止!' failed fi - + if [ "$OPENCL_CHANGE" ]; then ../tools/script/modelTest.py ~/AliNNModel 3 0.002 1 if [ $? -ne 0 ]; then @@ -431,7 +431,7 @@ opencv_test() { llm_test() { # 1. build llm with low memory - cmake -DMNN_OPENCV_TEST=ON -DMNN_BUILD_LLM=ON .. + cmake -DMNN_LOW_MEMORY=ON -DMNN_BUILD_LLM=ON -DMNN_SUPPORT_TRANSFORMER_FUSE=ON .. make -j8 llm_build_wrong=$[$? > 0] printf "TEST_NAME_LLM_BUILD: LLM编译测试\nTEST_CASE_AMOUNT_LLM_BUILD: {\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n" \ @@ -441,7 +441,7 @@ llm_test() { failed fi # 2. run llm model test - ./llm_demo ~/AliNNModel/qwen-1.8b-int4 0 10 ~/AliNNModel/qwen-1.8b-int4/prompt.txt + ./llm_demo ~/AliNNModel/qwen1.5-0.5b-int4/config.json ~/AliNNModel/qwen1.5-0.5b-int4/prompt.txt if [ $? 
-gt 0 ]; then echo '### LLM模型测试失败,测试终止!' failed @@ -543,7 +543,7 @@ android_model_test() { fi fi done - + models=`ls ~/AliNNModel/TestResource/` for model in $models do @@ -562,7 +562,7 @@ android_model_test() { fi fi done - + models=`ls ~/AliNNModel/TestWithDescribe/` for model in $models do diff --git a/test/CommonOpCreator.hpp b/test/CommonOpCreator.hpp index 1097589cd..9efbb51a5 100644 --- a/test/CommonOpCreator.hpp +++ b/test/CommonOpCreator.hpp @@ -24,7 +24,7 @@ static PadMode _convertPadMode(Express::PaddingMode mode) { } return PadMode_CAFFE; } -static Express::VARP _HybridConv(const std::vector& weight, std::vector&& bias, std::vector&& alpha, Express::VARP x, std::vector channel, std::vector kernelSize, +static Express::VARP _HybridConv(const std::vector& weight, const std::vector& bias, const std::vector& alpha, Express::VARP x, std::vector channel, std::vector kernelSize, Express::PaddingMode pad, std::vector stride, std::vector dilate, int group, std::vector pads, bool relu, bool relu6, int nbits, bool async) { std::unique_ptr convOp(new OpT); convOp->type = OpType_Convolution; @@ -56,7 +56,7 @@ static Express::VARP _HybridConv(const std::vector& weight, std::vectorcommon->relu = relu; conv2D->weight.clear(); MNN_ASSERT(bias.size() == channel[1]); - conv2D->bias = std::move(bias); + conv2D->bias = bias; return (Express::Variable::create(Express::Expr::create(convOp.get(), {x}))); } diff --git a/test/MNNTestSuite.cpp b/test/MNNTestSuite.cpp index fa039b526..157a54051 100644 --- a/test/MNNTestSuite.cpp +++ b/test/MNNTestSuite.cpp @@ -6,9 +6,10 @@ // Copyright © 2018, Alibaba Group Holding Limited // -#include "MNNTestSuite.h" #include - +#include +#include +#include "MNNTestSuite.h" MNNTestSuite* MNNTestSuite::gInstance = NULL; MNNTestSuite* MNNTestSuite::get() { @@ -30,14 +31,14 @@ void MNNTestSuite::add(MNNTestCase* test, const char* name) { } static void printTestResult(int wrong, int right, const char* flag) { - printf("TEST_NAME_UNIT%s: 单元测试%s\nTEST_CASE_AMOUNT_UNIT%s: ", flag, flag, flag); - printf("{\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n", wrong, right); + MNN_PRINT("TEST_NAME_UNIT%s: 单元测试%s\nTEST_CASE_AMOUNT_UNIT%s: ", flag, flag, flag); + MNN_PRINT("{\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n", wrong, right); } int MNNTestSuite::run(const char* key, int precision, const char* flag) { if (key == NULL || strlen(key) == 0) return 0; - + std::map runTimes; auto suite = MNNTestSuite::get(); std::string prefix = key; std::vector wrongs; @@ -46,26 +47,32 @@ int MNNTestSuite::run(const char* key, int precision, const char* flag) { MNNTestCase* test = suite->mTests[i]; if (test->name.find(prefix) == 0) { runUnit++; - printf("\trunning %s.\n", test->name.c_str()); + MNN_PRINT("\trunning %s.\n", test->name.c_str()); + MNN::Timer _t; auto res = test->run(precision); + runTimes.insert(std::make_pair(test->name, _t.durationInUs() / 1000.0f)); if (!res) { wrongs.emplace_back(test->name); } } } if (wrongs.empty()) { - printf("√√√ all <%s> tests passed.\n", key); + MNN_PRINT("√√√ all <%s> tests passed.\n", key); } for (auto& wrong : wrongs) { - printf("Error: %s\n", wrong.c_str()); + MNN_PRINT("Error: %s\n", wrong.c_str()); } printTestResult(wrongs.size(), runUnit - wrongs.size(), flag); + for (auto& iter : runTimes) { + MNN_PRINT("%s cost time: %.3f ms\n", iter.first.c_str(), iter.second); + } return wrongs.size(); } int MNNTestSuite::runAll(int precision, const char* flag) { auto suite = MNNTestSuite::get(); std::vector wrongs; + std::map runTimes; 
for (int i = 0; i < suite->mTests.size(); ++i) { MNNTestCase* test = suite->mTests[i]; if (test->name.find("speed") != std::string::npos) { @@ -76,18 +83,23 @@ int MNNTestSuite::runAll(int precision, const char* flag) { // Don't test for model because need resource continue; } - printf("\trunning %s.\n", test->name.c_str()); + MNN_PRINT("\trunning %s.\n", test->name.c_str()); + MNN::Timer _t; auto res = test->run(precision); + runTimes.insert(std::make_pair(test->name, _t.durationInUs() / 1000.0f)); if (!res) { wrongs.emplace_back(test->name); } } if (wrongs.empty()) { - printf("√√√ all tests passed.\n"); + MNN_PRINT("√√√ all tests passed.\n"); } for (auto& wrong : wrongs) { - printf("Error: %s\n", wrong.c_str()); + MNN_PRINT("Error: %s\n", wrong.c_str()); } printTestResult(wrongs.size(), suite->mTests.size() - wrongs.size(), flag); + for (auto& iter : runTimes) { + MNN_PRINT("%s cost time: %.3f ms\n", iter.first.c_str(), iter.second); + } return wrongs.size(); } diff --git a/test/core/RegionFuse.cpp b/test/core/RegionFuse.cpp index 9cabc388f..2820dd09b 100644 --- a/test/core/RegionFuse.cpp +++ b/test/core/RegionFuse.cpp @@ -26,16 +26,16 @@ class RegionFuseTest : public MNNTestCase { {0, 1, 16, 1, 0, 1, 16, 1, 1, 4, 16}, // transpose + memcpy = transpose: [1, 4, 16] => [1, 16, 4] => [16, 1, 4] {0, 1, 1, 16, 0, 1, 4, 1, 1, 16, 4}, - {0, 1, 1, 1, 0, 1, 1, 1, 16, 1, 4}, + {0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 64}, {0, 1, 1, 16, 0, 1, 4, 1, 1, 16, 4}, // transpose + transpose' = transpose'': [3, 4, 5] => [5, 3, 4] => [4, 5, 3] {0, 1, 1, 5, 0, 1, 12, 1, 1, 5, 12}, {0, 1, 1, 4, 0, 1, 15, 1, 1, 4, 15}, {0, 5, 1, 20, 0, 15, 3, 1, 4, 5, 3}, - // memcpy + memcpy' = memcpy'': offset:2 => offset:3 => offser:6+2-3=5 + // memcpy + memcpy' = memcpy'': offset:2 => offset:3 => offser:6+2-3=5, clip: range: 3-19 & 6-22 = 6-19, size=13 {2, 1, 1, 1, 3, 1, 1, 1, 1, 1, 16}, {6, 1, 1, 1, 0, 1, 1, 1, 1, 1, 16}, - {5, 1, 1, 1, 0, 1, 1, 1, 1, 1, 16}, + {5, 1, 1, 1, 0, 1, 1, 1, 1, 1, 13}, // transpose + slice (offset align) => [3, 3, 4] => [3, 4, 3] => [2, 4, 3] {0, 12, 1, 4, 0, 12, 3, 1, 3, 4, 3}, {12, 36, 3, 1, 0, 24, 3, 1, 1, 8, 3}, @@ -44,10 +44,10 @@ class RegionFuseTest : public MNNTestCase { {0, 12, 1, 4, 0, 12, 3, 1, 3, 4, 3}, {18, 36, 3, 1, 0, 18, 3, 1, 1, 6, 3}, {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, - // copy + expand (src < dst) => [34491] => [34645] => [34645, 2] + // copy + expand (src < dst) => [34491] => [34645] => [34645, 2] , clip [34491, 34645] -> [34491, 2] {0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 34491}, {0, 1, 1, 1, 0, 2, 1, 1, 34645, 1, 1}, - {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {0, 1, 1, 1, 0, 1, 1, 2, 1, 1, 34491}, // transpose + slice: [3, 256, 940] => [3, 940, 256] => [1, 256, 940] (expand_val = 1) {0, 240640, 1, 940, 0, 240640, 256, 1, 3, 940, 256}, {0, 1, 256, 1, 0, 1, 768, 1, 1, 940, 256}, @@ -60,26 +60,54 @@ class RegionFuseTest : public MNNTestCase { {0, 1600, 1, 4, 0, 1600, 400, 1, 53, 4, 400}, {0, 400, 20, 1, 0, 400, 20, 1, 190, 20, 20}, {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, - // pad + transpose + slice + transpose (not full copy) + // pad + transpose + slice + transpose (not full copy) {0, 12321, 111, 1, 0, 12544, 112, 1, 32, 111, 111}, {113, 12544, 112, 1, 0, 12321, 111, 1, 32, 111, 111}, - {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1} + {112, 12321, 111, 1, 0, 12321, 111, 1, 32, 110, 110} }; + TensorUtils::FuseWrap fuseUtils; for (int i = 0; i < N; i++) { Region src, dst; src.origin = nullptr; dst.origin = nullptr; ::memcpy(&src, data[3 * i], 44); ::memcpy(&dst, data[3 * i 
+ 1], 44); - bool fused = TensorUtils::fuseRegion(src, dst); + bool fused = fuseUtils.match(src, dst); + Region newDst = dst; + if (fused) { + fuseUtils.apply(src, newDst); + } if (data[3 * i + 2][0] < 0 && !fused) { continue; } - int cmp = ::memcmp(&dst, data[3 * i + 2], 44); - if (!fused || (cmp != 0)) { + if (!fused) { + MNN_ERROR("regionfuse %d test failed for fuse!\n", i); + return false; + } + Region target; + ::memcpy(&target, data[3 * i + 2], 44); + if (target.src.offset != newDst.src.offset || target.dst.offset != newDst.dst.offset) { MNN_ERROR("regionfuse %d test failed!\n", i); return false; } + int cmp = ::memcmp(&newDst.size, target.size, 3 * sizeof(int)); + if (cmp != 0) { + MNN_ERROR("regionfuse %d test size not match\n", i); + return false; + } + for (int u=0; u<3; ++u) { + if (newDst.size[u] == 1) { + continue; + } + if (newDst.src.stride[u] != target.src.stride[u]) { + MNN_ERROR("regionfuse %d test src stride not match\n", i); + return false; + } + if (newDst.dst.stride[u] != target.dst.stride[u]) { + MNN_ERROR("regionfuse %d test dst stride not match\n", i); + return false; + } + } } return true; } diff --git a/test/expr/MatMulTest.cpp b/test/expr/MatMulTest.cpp index e768051ba..5bcc97285 100644 --- a/test/expr/MatMulTest.cpp +++ b/test/expr/MatMulTest.cpp @@ -49,6 +49,20 @@ static bool checkMatMul(const float* C, const float* A, const float* B, int e, i return res; } +static void _originMatMul(float* C, const float* A, const float* B, int e, int l, int h) { + for (int y = 0; y < e; ++y) { + auto AY = A + l * y; + auto CY = C + h * y; + for (int x = 0; x < h; ++x) { + auto BX = B + x; + float expected = 0.0f; + for (int k = 0; k < l; ++k) { + expected += AY[k] * BX[k * h]; + } + CY[x] = expected; + } + } +} class MatMulTest : public MNNTestCase { public: virtual bool run(int precision) { @@ -288,6 +302,45 @@ class MatMulTest : public MNNTestCase { } } } + { + int e = 23; + int l = 33; + int h = 9; + { + // Test MatMul + std::unique_ptr op(new MNN::OpT); + op->type = MNN::OpType_MatMul; + op->main.type = MNN::OpParameter_MatMul; + op->main.value = new MNN::MatMulT; + auto matmulParam = op->main.AsMatMul(); + matmulParam->transposeA = false; + matmulParam->transposeB = false; + + auto x0 = _Input({}, NHWC, halide_type_of()); + auto x1 = _Input({}, NHWC, halide_type_of()); + x0->resize({e, l}); + x1->resize({l, h}); + auto y = Variable::create(Expr::create(op.get(), {x0, x1})); + Variable::prepareCompute({y}); + auto dstY = _Input({e, h}, NHWC, halide_type_of()); + fillFloat(x0->writeMap(), e, l); + fillFloat(x1->writeMap(), l, h); + _originMatMul(dstY->writeMap(), x0->readMap(), x1->readMap(), e, l, h); + + auto absMaxV = _ReduceMax(_Abs(dstY)); + auto diffV = _ReduceMax(_Abs(dstY - y)); + Variable::prepareCompute({absMaxV, diffV}, true); + + auto absMax = absMaxV->readMap()[0]; + MNN_ASSERT(absMax != 0.0f); + auto diff = diffV->readMap()[0]; + + if (diff > 0.01f * absMax) { + MNN_PRINT("%f error larger than %f * 0.001f\n", diff, absMax); + return false; + } + } + } return true; } }; diff --git a/test/op/ConvInt8Test.cpp b/test/op/ConvInt8Test.cpp index 68d25ad65..31f716046 100644 --- a/test/op/ConvInt8Test.cpp +++ b/test/op/ConvInt8Test.cpp @@ -293,7 +293,7 @@ class ConvInt8Im2colGemmTest : public ConvInt8TestCommon { std::vector> kernels = { {4, 2}, {1, 5}, {7, 1} }; - int iw = 34; int ih = 23; + int iw = 24; int ih = 17; std::vector titles = {"4x2", "1x5", "7x1"}; for (int sx=1; sx<2; ++sx) { for (int sy=1; sy<2; ++sy) { diff --git a/test/op/ConvolutionTest.cpp 
b/test/op/ConvolutionTest.cpp index f8245b0fe..836ace993 100644 --- a/test/op/ConvolutionTest.cpp +++ b/test/op/ConvolutionTest.cpp @@ -523,15 +523,10 @@ class ConvolutionInt8CommonTest : public ConvolutionCommonTest { virtual void generateWeight(std::vector& weightData, int ic, int oc, int kh, int kw, int dilation, int group, int sparseBlockOC) { auto numbers = group * (oc / group) * (ic / group) * kw * kh; weightData.resize(numbers); - float rate = 1.0f; - if (numbers > 10000) { - // Avoid exceed fp16 - rate = 0.01f; - } + float rate = 1.0f / numbers; for (int ri = 0; ri < numbers; ri++) { - int i = numbers - ri; - auto data = ((((i / kw)% 1317) * ((i / kh) % 1317)) % 1317 + i / ic + i / oc + (((oc - i) % 1317) * ic) % 1317 + i * ((oc - i) % 1317)) % 1317; - auto floatData = (float)(data % 255) / 255.0f / 1000.0f * rate; + int data = ri - numbers / 2; + auto floatData = (float)(data) * rate; weightData[ri] = data; } } @@ -629,28 +624,6 @@ class ConvolutionInt8CommonTest : public ConvolutionCommonTest { ::memcpy(input->writeMap(), inputData.data(), inputData.size() * sizeof(float)); // Single Conv auto weightLength = weightData.size(); - auto output = _HybridConv(weightData, std::move(biasData), std::move(wScale), input, - {ic, oc}, {kw, kh}, padMap[mode], {stride, stride}, {dilation, dilation}, group, {pad_w, pad_h}, false, false, nbit, async); - - // difference below 0.5% relative error is considered correct. - auto outputPtr = output->readMap(); - - if (debug) { - MNN_PRINT("\ndata NCHW shape:"); - printDims(input->getInfo()->dim); - MNN_PRINT("\nweight OIHW shape:"); - printDims({oc, ic, kh, kw}); - MNN_PRINT("\noutput NCHW shape:"); - printDims(output->getInfo()->dim); - MNN_PRINT("\nexpected output:"); - formatMatrix(outputData.data(), output->getInfo()->dim); - MNN_PRINT("\nexpected output 2:"); - formatMatrix(outputDataSeparateBias.data(), output->getInfo()->dim); - MNN_PRINT("\nreal output:"); - formatMatrix(outputPtr, output->getInfo()->dim); - } - // when using low precision, im2col or strassen convolution error rate to reference value is about 1e-4, winograd has larger error rate. 
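For reference, the convolution tests below compare the backend output against the float reference with a relative tolerance of 0.001 scaled by errorScale, using the checkVectorByRelativeError utility. A minimal sketch of that style of check, assuming a hypothetical standalone helper (not the MNN test utility itself) and normalization by the reference magnitude:

#include <algorithm>
#include <cmath>
#include <cstddef>

// Sketch: relative-error comparison of the kind these convolution tests rely on.
// The tolerance is scaled by the magnitude of the reference values, so a larger
// errorScale (low precision / 4-bit weights) loosens the check accordingly.
static bool checkByRelativeError(const float* real, const float* expect,
                                 size_t n, float tolerance) {
    float maxAbs = 0.0f;
    for (size_t i = 0; i < n; ++i) {
        maxAbs = std::max(maxAbs, std::fabs(expect[i]));
    }
    for (size_t i = 0; i < n; ++i) {
        if (std::fabs(real[i] - expect[i]) > tolerance * maxAbs) {
            return false; // outside the relative threshold
        }
    }
    return true;
}
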
- float errorScale = 1.0f; if (nbit == 4 && weightLength > 10000) { errorScale = 50.0f; @@ -658,29 +631,45 @@ class ConvolutionInt8CommonTest : public ConvolutionCommonTest { if (precision > MNN::BackendConfig::Precision_High) { errorScale = 100.0f; } - if (!checkVectorByRelativeError(outputPtr, outputData.data(), outputDataSeparateBias.data(), outputData.size(), 0.001 * errorScale)) { - MNN_PRINT("precision:%d, expect:\t expect2:\t real:\t\n", precision); - for (int i = 0; i < outputData.size(); ++i) - { - MNN_PRINT("%f\t, %f\t, %f\n", outputData[i],outputDataSeparateBias[i], outputPtr[i]); + std::vector> activations = { + {false, false}, + {true, false}, + {false, true} + }; + for (auto& activation : activations) { + auto output = _HybridConv(weightData, biasData, wScale, input, + {ic, oc}, {kw, kh}, padMap[mode], {stride, stride}, {dilation, dilation}, group, {pad_w, pad_h}, activation.first, activation.second, nbit, async); + auto toutputData = outputData; + float maxV = -10000.0f; + float minV = 10000.0f; + if (activation.first) { + for (auto& t : toutputData) { + maxV = ALIMAX(maxV, t); + minV = ALIMIN(minV, t); + t = ALIMAX(0.0f, t); + } +// MNN_PRINT("Max: %f -> Min:%f\n", maxV, minV); + } + if (activation.second) { + for (auto& t : toutputData) { + t = ALIMAX(0.0f, t); + t = ALIMIN(6.0f, t); + } } - MNN_ERROR("%s(%s) test failed for %d bits, async=%d !\n", test_op_name.c_str(), device_name.c_str(), nbit, async); - return false; - } + // difference below 0.5% relative error is considered correct. + auto outputPtr = output->readMap(); + // when using low precision, im2col or strassen convolution error rate to reference value is about 1e-4, winograd has larger error rate. - if (mBenchSpeed) { - int oh = output->getInfo()->dim[2], ow = output->getInfo()->dim[3]; - input.fix(VARP::INPUT); - MNN::Timer _t; - const int LOOP = 20; - for (int i = 0; i < LOOP; ++i) { - input->writeMap(); - output->readMap(); + if (!checkVectorByRelativeError(outputPtr, toutputData.data(), toutputData.data(), toutputData.size(), 0.001 * errorScale)) { + MNN_PRINT("precision:%d, expect:\t expect2:\t real:\t\n", precision); + for (int i = 0; i < toutputData.size(); ++i) + { + MNN_PRINT("%f\t, %f\t, %f\n", toutputData[i],outputDataSeparateBias[i], outputPtr[i]); + } + MNN_ERROR("%s(%s) test failed for %d bits, async=%d , relu: %d, relu6: %d!\n", test_op_name.c_str(), device_name.c_str(), nbit, async, activation.first, activation.second); + return false; } - auto time = (float)_t.durationInUs() / 1000.0f; - MNN_PRINT("ConvInt8Weight kernel=(%dx%d) input=(1x%dx%dx%d) output=(1x%dx%dx%d) stride=(%dx%d), avg time = %f\n", - kh, kw, ic, ih, iw, oc, oh, ow, stride, stride, 1.0 * time / LOOP); } return true; } @@ -743,13 +732,24 @@ class ConvolutionTest : public ConvolutionType { virtual ~ConvolutionTest() = default; protected: - static bool test(MNNForwardType type, const std::string& device_name, int precision, MNN::SparseAlgo sparseAlgo, std::vector blocks) { - + static bool test(MNNForwardType type, const std::string& device_name, int precision, MNN::SparseAlgo sparseAlgo, std::vector blocks, bool checkSpectial = false) { + int ocStep = 1; + int icStep = 1; + int isStep = 3; + std::vector ocSize = { + 1, 3, 10, 17 + }; + std::vector icSize = { + 1, 3, 10, 17 + }; + std::vector isSize = { + 1, 7, 9 + }; for (int b = 1; b <= 2; b++) { - for (int oc = 1; oc <= 17; oc += 4) { - for (int ic = 1; ic <= 18; ic += 5) { - for (int is = 1; is <= 17; is += 3) { + for (auto oc : ocSize) { + for (auto ic : icSize) { + for 
(auto is : isSize) { for (int kw = 1; kw <= 3 && kw <= is; kw+=2) { for (int kh = 1; kh <= 3 && kh <= is; kh+=3) { for (int d = 1; d <= 2; d++) { @@ -806,6 +806,9 @@ class ConvolutionTest : public ConvolutionType { } } } + if (!checkSpectial) { + return true; + } // Check Long convolution bool succ = ConvolutionType().test(type, device_name, "Conv2D", 1, 256, 256, 24, 24, PadMode_SAME, 0, 0, 3, 3, 1, 1, 1, precision, sparseAlgo, 4, false); @@ -844,7 +847,7 @@ class ConvolutionTestOnCPU : public DenseConvolutionTest { public: ~ConvolutionTestOnCPU() = default; virtual bool run(int precision) { - return DenseConvolutionTest::test(MNN_FORWARD_CPU, "CPU", precision, MNN::SparseAlgo_RANDOM, {1}); + return DenseConvolutionTest::test(MNN_FORWARD_CPU, "CPU", precision, MNN::SparseAlgo_RANDOM, {1}, true); } }; diff --git a/test/op/ZerosLikeTest.cpp b/test/op/ZerosLikeTest.cpp index fe383735d..4cae5e023 100644 --- a/test/op/ZerosLikeTest.cpp +++ b/test/op/ZerosLikeTest.cpp @@ -31,6 +31,17 @@ class ZerosLikeTest : public MNNTestCase { MNN_ERROR("ZerosLikeTest test failed!\n"); return false; } + output = _ZerosLike(input); + auto o2 = _Stack({output, output}); + auto o2ptr = o2->readMap(); + if (!checkVector(o2ptr, expectedOutput.data(), 16, 0.01)) { + MNN_ERROR("ZerosLikeTest test concat0 failed!\n"); + return false; + } + if (!checkVector(o2ptr + 16, expectedOutput.data(), 16, 0.01)) { + MNN_ERROR("ZerosLikeTest test concat1 failed!\n"); + return false; + } return true; } virtual bool run(int precision) { diff --git a/tools/converter/include/config.hpp b/tools/converter/include/config.hpp index b5ada4f79..befc9d51f 100644 --- a/tools/converter/include/config.hpp +++ b/tools/converter/include/config.hpp @@ -39,6 +39,7 @@ class MNN_PUBLIC modelConfig { bool forTraining = false; int weightQuantBits = 0;// If weightQuantBits > 0, it means the bit bool weightQuantAsymmetric = true; + int weightQuantBlock = -1; // The path of the model compression file that stores the int8 calibration table // or sparse parameters. 
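The converter change that follows wires the new weightQuantBlock option into block-wise weight quantization: when weightQuantBlock > 0, kernelSize is divisible by it, and the kernel is 1x1 (kxky == 1), each output channel is split into kernelSize / weightQuantBlock blocks and a scale (or min/scale pair in the asymmetric case) is stored per block instead of per channel. A minimal sketch of the symmetric per-block scale computation under those assumptions (the helper name is hypothetical):

#include <algorithm>
#include <cmath>
#include <vector>

// Sketch: one symmetric scale per (output channel, block).
// threshold is the integer clamp bound, e.g. 127 for 8-bit or 7 for 4-bit weights.
static std::vector<float> blockwiseScales(const std::vector<float>& weight,
                                          int kernelNum, int kernelSize,
                                          int blockSize, float threshold) {
    const int blockNum = kernelSize / blockSize;
    std::vector<float> scales(kernelNum * blockNum, 0.0f);
    for (int k = 0; k < kernelNum; ++k) {
        for (int b = 0; b < blockNum; ++b) {
            const float* ptr = weight.data() + k * kernelSize + b * blockSize;
            float absMax = 0.0f;
            for (int i = 0; i < blockSize; ++i) {
                absMax = std::max(absMax, std::fabs(ptr[i]));
            }
            scales[k * blockNum + b] = absMax / threshold; // per-block scale
        }
    }
    return scales;
}
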
std::string compressionParamsFile = ""; diff --git a/tools/converter/source/common/WeightQuantAndCoding.cpp b/tools/converter/source/common/WeightQuantAndCoding.cpp index 3fd718dfb..81526ff9f 100644 --- a/tools/converter/source/common/WeightQuantAndCoding.cpp +++ b/tools/converter/source/common/WeightQuantAndCoding.cpp @@ -82,6 +82,8 @@ void WeightQuantAndCoding(std::unique_ptr& op, const modelConfig& conf } int kernelNum = common->outputCount; int kernelSize = weightSize / kernelNum; + int kxky = common->kernelX * common->kernelY; + int icCount = kernelSize / kxky; bool asymmetricQuantFlag = config.weightQuantAsymmetric; @@ -91,7 +93,12 @@ void WeightQuantAndCoding(std::unique_ptr& op, const modelConfig& conf clampMin = -threshold - 1; } std::vector weightData, scales; - std::vector quantWeights; + // block-wise quant + int block_size = kernelSize, block_num = 1; + if (config.weightQuantBlock > 0 && (kernelSize % config.weightQuantBlock == 0) && kxky == 1) { + block_size = config.weightQuantBlock; + block_num = kernelSize / block_size; + } switch (opType) { case MNN::OpType_Convolution: @@ -99,41 +106,32 @@ void WeightQuantAndCoding(std::unique_ptr& op, const modelConfig& conf case MNN::OpType_Deconvolution: case MNN::OpType_DeconvolutionDepthwise: { weightData = std::move(param->weight); - if (asymmetricQuantFlag) { - scales.resize(kernelNum*2); + scales.resize(kernelNum * block_num * 2); for (int k = 0; k < kernelNum; k++) { - int beginIndex = k * kernelSize; - auto minAndMax = findMinMax(weightData.data() + beginIndex, kernelSize); - float min = minAndMax[0]; - float max = minAndMax[1]; - float scale = (max - min) / (threshold - clampMin); - - scales[2*k] = min; - scales[2*k+1] = scale; - - for (int ii = 0; ii < kernelSize; ii++) { - float* ptr = weightData.data() + beginIndex; - int8_t quantValue = int8_t(std::round((ptr[ii] - min) / scale + clampMin)); - quantWeights.emplace_back(quantValue); + for (int b = 0; b < block_num; b++) { + int beginIndex = k * kernelSize + b * block_size; + auto minAndMax = findMinMax(weightData.data() + beginIndex, block_size); + float min = minAndMax[0]; + float max = minAndMax[1]; + float scale = (max - min) / (threshold - clampMin); + + int scaleIndex = k * block_num + b; + scales[2 * scaleIndex] = min; + scales[2 * scaleIndex + 1] = scale; } } } else { - scales.resize(kernelNum); + scales.resize(kernelNum * block_num); for (int k = 0; k < kernelNum; k++) { - int beginIndex = k * kernelSize; - auto absMax = findAbsMax(weightData.data() + beginIndex, kernelSize); - - scales[k] = absMax / threshold; - - for (int ii = 0; ii < kernelSize; ii++) { - float* ptr = weightData.data() + beginIndex; - int8_t quantValue = int8_t(std::round(ptr[ii] / scales[k])); - quantWeights.emplace_back(quantValue); + for (int b = 0; b < block_num; b++) { + int beginIndex = k * kernelSize + b * block_size; + auto absMax = findAbsMax(weightData.data() + beginIndex, block_size); + int scaleIndex = k * block_num + b; + scales[scaleIndex] = absMax / threshold; } } } - break; } case MNN::OpType_ConvInt8: @@ -150,12 +148,14 @@ void WeightQuantAndCoding(std::unique_ptr& op, const modelConfig& conf break; } + kernelSize = block_size; + kernelNum = kernelNum * block_num; if (opType == MNN::OpType_ConvInt8 || opType == MNN::OpType_DepthwiseConvInt8) { param->quanParameter = IDSTEncoder::encode(weightData.data(), scales, kernelSize, kernelNum, false, param->symmetricQuan->weight.data(), int(clampMin), bits); param->symmetricQuan->weight.clear(); param->quanParameter->alpha = {1.0f}; // 
fake scales } else { - param->quanParameter = IDSTEncoder::encode(weightData.data(), scales, kernelSize, kernelNum, asymmetricQuantFlag, quantWeights.data(), int(clampMin), bits, config.detectSparseSpeedUp); + param->quanParameter = IDSTEncoder::encode(weightData.data(), scales, kernelSize, kernelNum, asymmetricQuantFlag, nullptr, int(clampMin), bits, config.detectSparseSpeedUp); param->weight.clear(); std::vector empty; param->weight.swap(empty); diff --git a/tools/converter/source/common/cli.cpp b/tools/converter/source/common/cli.cpp index c2a64a8ee..ffe8c8ae9 100644 --- a/tools/converter/source/common/cli.cpp +++ b/tools/converter/source/common/cli.cpp @@ -205,6 +205,11 @@ bool Cli::initializeMNNConvertArgs(modelConfig &modelPath, int argc, char **argv "but asymmetric quant model cannot run on old MNN versions. You will need to upgrade MNN to new version to solve this problem. default: false", cxxopts::value() ) + ( + "weightQuantBlock", + "using block-wise weight quant, set block size, defaut: -1, which means channel-wise weight quant", + cxxopts::value() + ) ( "compressionParamsFile", "The path of the compression parameters that stores activation, " @@ -437,7 +442,10 @@ bool Cli::initializeMNNConvertArgs(modelConfig &modelPath, int argc, char **argv modelPath.weightQuantBits = result["weightQuantBits"].as(); } if (result.count("weightQuantAsymmetric")) { - modelPath.weightQuantAsymmetric = true; + modelPath.weightQuantAsymmetric = result["weightQuantAsymmetric"].as(); + } + if (result.count("weightQuantBlock")) { + modelPath.weightQuantBlock = result["weightQuantBlock"].as(); } if (result.count("saveStaticModel")) { modelPath.saveStaticModel = true; diff --git a/tools/converter/source/common/convertToStaticModel.cpp b/tools/converter/source/common/convertToStaticModel.cpp index bfce390cf..2d8c120db 100644 --- a/tools/converter/source/common/convertToStaticModel.cpp +++ b/tools/converter/source/common/convertToStaticModel.cpp @@ -332,7 +332,7 @@ void converToStaticModel(const Net* net, std::map>& } std::vector infos; initPipelineInfosFromNet(infos, net, allTensors); - GeometryComputer::Context ctx(defaultBackend); + GeometryComputer::Context ctx(Interpreter::GeometryComputeMask::GEOMETRCOMPUTEMASK_ALL, defaultBackend); // resize the session's info and store to buffer std::vector constTensors; GeometryComputerUtils::buildConstantTensors(infos); diff --git a/tools/converter/source/optimizer/merge/FuseAttention.cpp b/tools/converter/source/optimizer/merge/FuseAttention.cpp index c0b69061c..8c2058855 100644 --- a/tools/converter/source/optimizer/merge/FuseAttention.cpp +++ b/tools/converter/source/optimizer/merge/FuseAttention.cpp @@ -22,6 +22,22 @@ class FuseAttention { VARP query, key, value, mask; }; +static EXPRP is_gqa(EXPRP& x) { + if (!helpers::IsReshape(x)) { + return x; + } + auto y = x->inputs().at(0)->expr().first; + if (!helpers::IsBroadcastTo(y)) { + return x; + } + y = y->inputs().at(0)->expr().first; + if (!helpers::IsUnsqueeze(y)) { + return x; + } + y = y->inputs().at(0)->expr().first; + return y; +} + FuseAttention::FuseAttention() { auto match = [this](EXPRP expr) -> bool { auto config = Global::Get(); @@ -44,9 +60,9 @@ FuseAttention::FuseAttention() { if (!helpers::IsMatMul(matmul)) { return false; } - - // transpose y = matmul->inputs().at(1)->expr().first; + y = is_gqa(y); + // transpose if (!helpers::IsTranspose(y)) { return false; } @@ -98,8 +114,9 @@ FuseAttention::FuseAttention() { // query query = z->inputs().at(0); - // transpose y = 
x->inputs().at(1)->expr().first; + // transpose + y = is_gqa(y); if (!helpers::IsTranspose(y)) { return false; } @@ -120,6 +137,9 @@ FuseAttention::FuseAttention() { // For target version < 2.8 , don't support fmha_v2 return false; } + if (expr->name().size() > 0) { + MNN_PRINT("Fuse Attention as %s\n", expr->name().c_str()); + } std::unique_ptr attention(new OpT); attention->name = "Attention" + expr->name(); @@ -206,6 +226,9 @@ RemovePastKeyValue::RemovePastKeyValue() { // For target version < 2.8 , don't support fmha_v2 return false; } + if (!expr->name().empty()) { + MNN_PRINT("Remove past KV for %s\n", expr->name().c_str()); + } // past-kv remove std::unique_ptr reshape(new OpT); diff --git a/tools/converter/source/optimizer/merge/MergeHelpers.cpp b/tools/converter/source/optimizer/merge/MergeHelpers.cpp index b49201c0e..4f4401226 100644 --- a/tools/converter/source/optimizer/merge/MergeHelpers.cpp +++ b/tools/converter/source/optimizer/merge/MergeHelpers.cpp @@ -167,6 +167,11 @@ bool IsExpandDims(EXPRP expr) { return op && op->type() == OpType_ExpandDims; } +bool IsBroadcastTo(EXPRP expr) { + const Op* op = expr->get(); + return op && op->type() == OpType_BroadcastTo; +} + EXPRP InputExpr(EXPRP expr, int input_index) { return expr->inputs().at(input_index)->expr().first; } diff --git a/tools/converter/source/optimizer/merge/MergeHelpers.hpp b/tools/converter/source/optimizer/merge/MergeHelpers.hpp index bfc720c97..a4515ab0e 100644 --- a/tools/converter/source/optimizer/merge/MergeHelpers.hpp +++ b/tools/converter/source/optimizer/merge/MergeHelpers.hpp @@ -48,6 +48,7 @@ bool IsReductionMean(Express::EXPRP expr); bool IsConvolution(Express::EXPRP expr); bool IsExpandDims(Express::EXPRP expr); +bool IsBroadcastTo(Express::EXPRP expr); Express::EXPRP InputExpr(Express::EXPRP expr, int input_index); Express::EXPRP OutputExpr(Express::EXPRP expr, int output_index); diff --git a/tools/converter/source/optimizer/postconvert/RemoveInvalidCast.cpp b/tools/converter/source/optimizer/postconvert/RemoveInvalidCast.cpp index db3d0e86c..4f4001b00 100644 --- a/tools/converter/source/optimizer/postconvert/RemoveInvalidCast.cpp +++ b/tools/converter/source/optimizer/postconvert/RemoveInvalidCast.cpp @@ -12,6 +12,7 @@ #include #include #include "../PostTreatUtils.hpp" +#include using namespace MNN; class RemoveInvalidCast : public PostConverter { public: @@ -67,6 +68,20 @@ class RemoveInvalidCast : public PostConverter { case MNN::OpType_Cast: types[op->outputIndexes[0]] = op->main.AsCastParam()->dstT; break; + // Float Op + case MNN::OpType_PReLU: + case MNN::OpType_Softmax: + case MNN::OpType_Convolution: + case MNN::OpType_ConvolutionDepthwise: + case MNN::OpType_Convolution3D: + case MNN::OpType_Deconvolution: + case MNN::OpType_DeconvolutionDepthwise: + case MNN::OpType_MatMul: + if (op->outputIndexes.size() == 1) { + // 4 is integer matmul + types[op->outputIndexes[0]] = MNN::DataType_DT_FLOAT; + } + break; case MNN::OpType_Const: case MNN::OpType_TrainableParam: types[op->outputIndexes[0]] = op->main.AsBlob()->dataType; @@ -74,6 +89,13 @@ class RemoveInvalidCast : public PostConverter { case MNN::OpType_Fill: types[op->outputIndexes[0]] = types[op->inputIndexes[1]]; break; + case MNN::OpType_Slice: + case MNN::OpType_SliceTf: + case MNN::OpType_Unpack: + for (auto v : op->outputIndexes) { + types[v] = types[op->inputIndexes[0]]; + } + break; case MNN::OpType_Shape: case MNN::OpType_Size: case MNN::OpType_Rank: @@ -111,12 +133,33 @@ class RemoveInvalidCast : public PostConverter { } } break; + 
// Deform + case MNN::OpType_Broastcast: + case MNN::OpType_Concat: + case MNN::OpType_Crop: + case MNN::OpType_CropAndResize: + case MNN::OpType_Col2Im: + case MNN::OpType_DepthToSpace: + case MNN::OpType_ExpandDims: + case MNN::OpType_Flatten: + case MNN::OpType_Interp: + case MNN::OpType_Interp3D: + case MNN::OpType_Im2Col: + case MNN::OpType_Pack: + case MNN::OpType_Padding: + case MNN::OpType_Permute: + case MNN::OpType_Reshape: + case MNN::OpType_Resize: + case MNN::OpType_StridedSlice: + case MNN::OpType_SpaceToDepth: + case MNN::OpType_Squeeze: + case MNN::OpType_Transpose: + case MNN::OpType_Unsqueeze: + { + types[op->outputIndexes[0]] = types[op->inputIndexes[0]]; + } + break; default: - if (op->inputIndexes.size() > 0) { - for (int i=0; ioutputIndexes.size(); ++i) { - types[op->outputIndexes[i]] = types[op->inputIndexes[0]]; - } - } break; } } @@ -134,7 +177,7 @@ class RemoveInvalidCast : public PostConverter { } if (types[op->inputIndexes[0]] != types[op->outputIndexes[0]]) { iter++; - break; + continue; } if (std::find(net->outputName.begin(), net->outputName.end(), net->tensorName[op->outputIndexes[0]]) != net->outputName.end()) { iter++; diff --git a/tools/cpp/ExprDebug.hpp b/tools/cpp/ExprDebug.hpp index 0665f193c..ce342a0dd 100644 --- a/tools/cpp/ExprDebug.hpp +++ b/tools/cpp/ExprDebug.hpp @@ -1,6 +1,8 @@ #include #include #include +#include +#include #define DUMP_NUM_DATA(type) \ auto data = tensor->host(); \ for (int z = 0; z < outside; ++z) { \ @@ -125,7 +127,7 @@ static void _initDebug() { } return true; }; - MNN::Express::Executor::getGlobalExecutor()->setCallBack(std::move(beforeCallBack), std::move(callBack)); + MNN::Express::ExecutorScope::Current()->setCallBack(std::move(beforeCallBack), std::move(callBack)); } @@ -170,7 +172,7 @@ static void _initTimeTrace() { gTimeTraceInfo->end(info); return true; }; - MNN::Express::Executor::getGlobalExecutor()->setCallBack(std::move(beforeCallBack), std::move(callBack)); + MNN::Express::ExecutorScope::Current()->setCallBack(std::move(beforeCallBack), std::move(callBack)); } template @@ -274,5 +276,5 @@ static void _initTensorStatic() { } return true; }; - MNN::Express::Executor::getGlobalExecutor()->setCallBack(std::move(beforeCallBack), std::move(callBack)); + MNN::Express::ExecutorScope::Current()->setCallBack(std::move(beforeCallBack), std::move(callBack)); } diff --git a/tools/cpp/ModuleBasic.cpp b/tools/cpp/ModuleBasic.cpp index aacb1d4d6..c9b4e93ad 100644 --- a/tools/cpp/ModuleBasic.cpp +++ b/tools/cpp/ModuleBasic.cpp @@ -232,6 +232,13 @@ int main(int argc, char *argv[]) { // Need tensor static for each op, open debug rtmgr->setMode(Interpreter::Session_Debug); } + // For Debug + if (false) { + int geometryMask = Interpreter::GeometryComputeMask::GEOMETRCOMPUTEMASK_ALL; + geometryMask -= Interpreter::GeometryComputeMask::GEOMETRCOMPUTEMASK_FUSEREGION; + geometryMask -= Interpreter::GeometryComputeMask::GEOMETRCOMPUTEMASK_OPENCACHE; + rtmgr->setHint(Interpreter::GEOMETRY_COMPUTE_MASK, geometryMask); + } if (runMask & 4) { // Need time trace for each op, open debug rtmgr->setMode(Interpreter::Session_Debug); diff --git a/tools/script/apply_gptq.py b/tools/script/apply_gptq.py new file mode 100644 index 000000000..06e81c965 --- /dev/null +++ b/tools/script/apply_gptq.py @@ -0,0 +1,187 @@ +import json +import torch +import argparse + +class MNNWeight: + def __init__(self, name, external, a_min): + self.name = name + self.external = external + self.a_min = a_min + self.parse_name() + + def __repr__(self) -> str: + return 
f'{self.layer_id}.{self.op_id}.{self.block_id}, {self.external}' + + def parse_name(self): + parts = self.name.split('/') + if len(parts) > 4: + self.layer_id = parts[1].split('.')[1] + self.op_id = parts[2] + '.' + parts[3] + self.block_id = parts[-1].split('__')[-1] + else: + self.layer_id = -1 + self.op_id = parts[2] + self.block_id = parts[-1].split('__')[-1] + + def key(self): return f'{self.layer_id}.{self.op_id}' + def idx(self): return int(self.block_id) + def offset(self): return self.external[0] + def weight_size(self): return self.external[1] + def scale_size(self): return self.external[2] + +def weight_reorder(qweight, bits=4, group_size=128): + oc = qweight.shape[-1] + wf = torch.tensor(list(range(0, 32, bits)), dtype=torch.int32).unsqueeze(0) + weight = torch.bitwise_right_shift(torch.unsqueeze(qweight, 1).expand(-1, 32 // bits, -1), wf.unsqueeze(-1)).to(torch.int16 if bits == 8 else torch.int8) + torch.bitwise_and(weight, (2 ** bits) - 1, out=weight) + weight = weight.reshape(-1, oc).transpose(1, 0) + weight = weight.reshape(-1, 2).to(torch.uint8) + weight = weight[:, 0] * 16 + weight[:, 1] + return weight + +class MNNModel: + def __init__(self, model, weight): + self.mnn_graph = json.load(open(model, 'rt')) + self.external_weight = weight + self.parse_conv() + + def parse_conv(self): + self.weights = [] + for op in self.mnn_graph['oplists']: + if op['type'] == 'Convolution': + name = op['name'] + external = op['main']['external'] + a_min = op['main']['quanParameter']['aMin'] + self.weights.append(MNNWeight(name, external, a_min)) + + def apply_weight_split(self, gptq_tensor): + bin_file = open(self.external_weight, 'r+b') + for mnn_weight in self.weights: + idx = mnn_weight.idx() + gptq_weight = gptq_tensor.get(mnn_weight.key()) + if gptq_weight is None: continue + print(f'write {mnn_weight.key()}.{idx} ... ', end='') + weight = gptq_weight.weight(idx) + scale = gptq_weight.scale(idx).float() + # write weight data + weight = weight_reorder(weight) + weight_bytes = weight.numpy().tobytes() + weight_size = mnn_weight.weight_size() + header_len = weight_size - len(weight_bytes) + assert(header_len > 0) + bin_file.seek(mnn_weight.offset() + header_len) + bin_file.write(weight_bytes) + scale_size = mnn_weight.scale_size() + is_asy = scale.numel() * scale.element_size() < scale_size + # write scale data + if is_asy: + zeros = mnn_weight.a_min * scale + scale = torch.stack([zeros, scale], axis=-1) + scale_bytes = scale.numpy().tobytes() + assert(scale_size == len(scale_bytes)) + bin_file.write(scale_bytes) + print('Done!') + # break + bin_file.close() + + def apply_weight(self, gptq_tensor): + bin_file = open(self.external_weight, 'r+b') + for mnn_weight in self.weights: + gptq_weight = gptq_tensor.get(mnn_weight.key()) + if gptq_weight is None: continue + print(f'write {mnn_weight.key()} ... 
', end='') + weight = gptq_weight.qweight + scale = gptq_weight.scales.float().transpose(1, 0) + # write weight data + weight = weight_reorder(weight) + weight_bytes = weight.numpy().tobytes() + weight_size = mnn_weight.weight_size() + header_len = weight_size - len(weight_bytes) + assert(header_len > 0) + bin_file.seek(mnn_weight.offset() + header_len) + bin_file.write(weight_bytes) + scale_size = mnn_weight.scale_size() + is_asy = scale.numel() * scale.element_size() < scale_size + # write scale data + if is_asy: + zeros = mnn_weight.a_min * scale + scale = torch.stack([zeros, scale], axis=-1) + scale_bytes = scale.numpy().tobytes() + assert(scale_size == len(scale_bytes)) + bin_file.write(scale_bytes) + print('Done!') + bin_file.close() + + def apply(self, gptq_tensor): + if self.weights[0].block_id.isdigit(): + self.apply_weight_split(gptq_tensor) + else: + self.apply_weight(gptq_tensor) + +class GPTQWeight: + def __init__(self, name): + self.name = name + + def __repr__(self) -> str: + if hasattr(self, 'qweight'): + return f'{self.name}, {self.qweight.shape}, {self.scales.shape}' + return 'None' + + def add(self, name, tensor): + setattr(self, name, tensor) + + def weight(self, idx): + shape = self.qweight.shape + if len(shape) == 2: + ic, oc = shape + self.qweight = self.qweight.reshape(ic//16, 16, oc) + return self.qweight[idx] + + def scale(self, idx): + return self.scales[idx] + +class GPTQTensor: + def __init__(self, file): + self.file = file + self.load() + + def prefix(self, name): + splits = name.split('.') + if len(splits) < 5: + return None, None + pre = f'{splits[2]}.{splits[3]}.{splits[4]}' + suf = splits[-1] + return pre, suf + + def __repr__(self) -> str: + return self.weight_dict.__repr__() + + def get(self, key : str): + if key in self.weight_dict: + return self.weight_dict[key] + return None + + def load(self): + self.weight_dict = dict() + from safetensors import safe_open + with safe_open(self.file, framework="pt") as f: + for k in f.keys(): + p, s = self.prefix(k) + if p is None: continue + if s not in ['qweight', 'scales']: continue + if p not in self.weight_dict: + self.weight_dict[p] = GPTQWeight(p) + self.weight_dict[p].add(s, f.get_tensor(k)) + +def main(args): + mnn_model = MNNModel(args.mnn_graph, args.mnn_weight) + gptq_weight = GPTQTensor(args.gptq_tensor) + mnn_model.apply(gptq_weight) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='apply_gptq', formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--mnn_graph', type=str, required=True, help='mnn graph json path.') + parser.add_argument('--mnn_weight', type=str, required=True, help='mnn weight file path.') + parser.add_argument('--gptq_tensor', type=str, required=True, help='gptq tensor path.') + args = parser.parse_args() + main(args) diff --git a/transformers/llm/config.json b/transformers/llm/config.json new file mode 100755 index 000000000..7025fad4b --- /dev/null +++ b/transformers/llm/config.json @@ -0,0 +1,9 @@ +{ + "llm_model": "llm.mnn", + "llm_weight": "llm.mnn.weight", + + "backend_type": "cpu", + "thread_num": 4, + "precision": "low", + "memory": "low" +} diff --git a/transformers/llm/engine/include/llm.hpp b/transformers/llm/engine/include/llm.hpp index 306cd66f9..4754010cf 100644 --- a/transformers/llm/engine/include/llm.hpp +++ b/transformers/llm/engine/include/llm.hpp @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include #include #include @@ -21,9 +23,11 @@ #include #include #include "tokenizer.hpp" +#include 
"rapidjson/document.h" using namespace MNN; using namespace Express; +using namespace rapidjson; class Tokenizer; class Pipeline; @@ -46,358 +50,293 @@ class LlmStreamBuffer : public std::streambuf { CallBack callback_ = nullptr; }; -class MNN_PUBLIC Llm { -public: - Llm() { - // default tokenier is senrencepiece - tokenizer_.reset(new Sentencepiece); +static inline bool has_suffix(const std::string& str, const std::string& suffix) { + return str.size() >= suffix.size() && + str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0; +} + +static inline std::string base_dir(const std::string& path) { + size_t pos = path.find_last_of("/\\"); + if (pos == std::string::npos) { + return "./"; + } else { + return path.substr(0, pos + 1); } - virtual ~Llm() { - decode_modules_.clear(); - prefill_modules_.clear(); - modules_.clear(); - visual_module_.reset(); - runtime_manager_.reset(); +} + +static inline std::string file_name(const std::string& path) { + size_t pos = path.find_last_of("/\\"); + if (pos == std::string::npos) { + return path; + } else { + return path.substr(pos + 1); } - // Default memory is low, precision is low - static Llm* createLLM(const std::string& path, std::string model_type = "auto", int forwardType = 0, int memoryprecison = 10); - void load(const std::string& model_dir); - void chat(); - void trace(bool start); - void warmup(); - std::string response(const std::string& input_str, std::ostream* os = &std::cout, const char* end_with = nullptr); - std::string response_nohistory(const std::string& input_str, std::ostream* os = &std::cout, const char* end_with = nullptr); - float load_progress() { return load_progress_; } - void reset(); - void print_speed(); - friend class Pipeline; -public: - std::vector history_; - // forward info - int max_seq_len_ = 1024; - int prompt_len_ = 0; - int gen_seq_len_ = 0; - int all_seq_len_ = 0; - // time - int64_t prefill_us_ = 0; - int64_t decode_us_ = 0; -protected: - void response_init(); - std::string response_impl(const std::vector& input_ids, std::ostream* os, const char* end_with); - VARP embedding(const std::vector& input_ids); - VARP txt_embedding(const std::vector& input_ids); - int forward(const std::vector& input_ids); - std::vector tokenizer_encode(const std::string& input_str); - std::string decode(int id); -protected: - VARP inputs_embeds_, attention_mask_, position_ids_; - // model configs - bool is_single_ = false; - bool is_disk_embedding_ = false; - bool is_visual_ = false; - int layer_nums_ = 0; - int hidden_size_ = 4096; - std::vector key_value_shape_ = {}; - std::string model_name_ = ""; - std::string disk_embedding_file_ = ""; - // gen info - float load_progress_ = 0.f; - // tokenizer - std::unique_ptr tokenizer_; - std::shared_ptr visual_module_; -private: - virtual VARP visual_embedding(const std::vector& input_ids) { return nullptr; } - virtual std::vector tokenizer(const std::string& query) = 0; - virtual VARP gen_attention_mask(int seq_len) = 0; - virtual VARP gen_position_ids(int seq_len) = 0; - virtual bool is_stop(int token_id) = 0; -private: - // MNN Modules - std::shared_ptr runtime_manager_; - std::vector> modules_; - std::vector> decode_modules_; - std::vector> prefill_modules_; - std::vector past_key_values_; - // model dir - std::string model_dir_; - int mForwardType = 0; - int mPrecisionMemory = 0; -}; +} -// some llm models -class Chatglm_6b : public Llm { +class rapid_json_wrapper { public: - Chatglm_6b() { - model_name_ = "Chatglm_6b"; - layer_nums_ = 28; - key_value_shape_ = {2, 0, 1, 32, 
128}; + Document document; + rapid_json_wrapper() {} + rapid_json_wrapper(Document doc) : document(std::move(doc)) {} + static rapid_json_wrapper parse(const std::ifstream& ifile) { + std::ostringstream ostr; + ostr << ifile.rdbuf(); + Document document; + document.Parse(ostr.str().c_str()); + rapid_json_wrapper json_wrapper(std::move(document)); + return json_wrapper; + } + static rapid_json_wrapper parse(const char* str) { + Document document; + document.Parse(str); + rapid_json_wrapper json_wrapper(std::move(document)); + return json_wrapper; } -private: - virtual std::vector tokenizer(const std::string& query) override; - virtual VARP gen_attention_mask(int seq_len) override; - virtual VARP gen_position_ids(int seq_len) override; - virtual bool is_stop(int token_id) override; - int context_len_ = 0; -}; -class Chatglm2_6b : public Llm { -public: - Chatglm2_6b() { - model_name_ = "Chatglm2_6b"; - layer_nums_ = 28; - key_value_shape_ = {2, 0, 1, 2, 128}; + template + T value(const char* key, const T& defualt_value) const { + if (document.HasMember(key)) { + const auto& value = document[key]; + if constexpr (std::is_same::value) { + if (value.IsInt()) return value.GetInt(); + } else if constexpr (std::is_same::value || std::is_same::value) { + if (value.IsString()) return value.GetString(); + } else if constexpr (std::is_same::value) { + if (value.IsBool()) return value.GetBool(); + } else if constexpr (std::is_same>::value) { + if (value.IsArray()) { + std::vector result; + for (auto& v : value.GetArray()) { + if (v.IsInt()) { + result.push_back(v.GetInt()); + } + } + return result; + } + } + } + return defualt_value; + } + std::string value(const char key[], const char defualt_value[]) const { + return value(key, std::string(defualt_value)); } -private: - virtual std::vector tokenizer(const std::string& query) override; - virtual VARP gen_attention_mask(int seq_len) override; - virtual VARP gen_position_ids(int seq_len) override; - virtual bool is_stop(int token_id) override; }; -class Phi_2 : public Chatglm2_6b { +class LlmConfig { public: - Phi_2() { - model_name_ = "Phi_2"; - layer_nums_ = 32; - key_value_shape_ = {1, 0, 2, 32, 80}; - hidden_size_ = 2560; - tokenizer_.reset(new Tiktoken); + std::string base_dir_; + rapid_json_wrapper config_, llm_config_; + LlmConfig() {} + LlmConfig(const std::string& path) { + // load config + if (has_suffix(path, ".json")) { + std::ifstream config_file(path); + if (config_file.is_open()) { + config_ = rapid_json_wrapper::parse(config_file); + } else { + std::cerr << "Unable to open config file: " << path << std::endl; + } + base_dir_ = base_dir(path); + } else { + // compatibility with the original usage + if (has_suffix(path, ".mnn")) { + auto model_name = file_name(path); + std::string json_str = R"({ + "llm_model": ")" + model_name + R"(", + "llm_weight": ")" + model_name + R"(.weight" + })"; + config_ = rapid_json_wrapper::parse(json_str.c_str()); + base_dir_ = base_dir(path); + } else { + const char* json_cstr = "{}"; + config_ = rapid_json_wrapper::parse(json_cstr); + base_dir_ = path; + } + } + // using config's base_dir + base_dir_ = config_.value("base_dir", base_dir_); + // load llm_config for model info + std::ifstream llm_config_file(llm_config()); + if (llm_config_file.is_open()) { + llm_config_ = rapid_json_wrapper::parse(llm_config_file); + } else { + std::cerr << "Unable to open llm_config file: " << llm_config() << std::endl; + } } -private: - virtual std::vector tokenizer(const std::string& query) override; - virtual bool 
is_stop(int token_id) override; -}; -class Qwen_7b : public Llm { -public: - Qwen_7b() { - model_name_ = "Qwen_7b"; - layer_nums_ = 32; - key_value_shape_ = {2, 1, 0, 32, 128}; - hidden_size_ = 4096; - tokenizer_.reset(new Tiktoken); + // < model file config start + std::string llm_config() const { + return base_dir_ + config_.value("llm_config", "llm_config.json"); } -private: - virtual std::vector tokenizer(const std::string& query) override; - virtual VARP gen_attention_mask(int seq_len) override; - virtual VARP gen_position_ids(int seq_len) override; - virtual bool is_stop(int token_id) override; -}; -class Qwen_vl : public Qwen_7b { -public: - Qwen_vl() { - model_name_ = "Qwen_vl"; - is_visual_ = true; - layer_nums_ = 32; - key_value_shape_ = {2, 1, 0, 32, 128}; - hidden_size_ = 4096; - tokenizer_.reset(new Tiktoken); + std::string llm_model() const { + return base_dir_ + config_.value("llm_model", "llm.mnn"); } -private: - const int img_size_ = 448; - const int imgpad_len_ = 256; - const int img_start_ = 151857; - const int img_end_ = 151858; - const int img_pad_ = 151859; -private: - std::vector url_encode(const std::string& url); - virtual VARP visual_embedding(const std::vector& input_ids) override; - virtual std::vector tokenizer(const std::string& query) override; - virtual VARP gen_attention_mask(int seq_len) override; -}; -class Qwen_1_8b : public Qwen_7b { -public: - Qwen_1_8b() { - model_name_ = "Qwen_1.8b"; - layer_nums_ = 24; - key_value_shape_ = {2, 1, 0, 16, 128}; - hidden_size_ = 2048; - tokenizer_.reset(new Tiktoken); + std::string llm_weight() const { + return base_dir_ + config_.value("llm_weight", "llm.mnn.weight"); } -}; -class Llama2_7b : public Llm { -public: - Llama2_7b() { - model_name_ = "Llama2_7b"; - layer_nums_ = 32; - key_value_shape_ = {2, 1, 32, 0, 128}; + std::string block_model(int index) const { + return base_dir_ + config_.value("block_model", "block_") + std::to_string(index) + ".mnn"; } -private: - virtual std::vector tokenizer(const std::string& query) override; - virtual VARP gen_attention_mask(int seq_len) override; - virtual VARP gen_position_ids(int seq_len) override; - virtual bool is_stop(int token_id) override; -}; -class MiniCPM_1_2b : public Llama2_7b { -public: - MiniCPM_1_2b() { - model_name_ = "MiniCPM_1_2b"; - layer_nums_ = 52; - key_value_shape_ = {2, 1, 8, 0, 64}; - hidden_size_ = 1536; + std::string lm_model() const { + return base_dir_ + config_.value("lm_model", "lm.mnn"); } -private: - virtual std::vector tokenizer(const std::string& query) override; -}; -class MiniCPM_2_4b : public Llama2_7b { -public: - MiniCPM_2_4b() { - model_name_ = "MiniCPM_1_2b"; - layer_nums_ = 40; - key_value_shape_ = {2, 1, 36, 0, 64}; - hidden_size_ = 2304; + std::string embedding_model() const { + return base_dir_ + config_.value("embedding_model", "embedding.mnn"); } -private: - virtual std::vector tokenizer(const std::string& query) override; -}; -class Llama3_8b : public Llama2_7b { -public: - Llama3_8b() { - model_name_ = "Llama3_8b"; - layer_nums_ = 32; - key_value_shape_ = {2, 1, 8, 0, 128}; - hidden_size_ = 4096; + std::string embedding_file() const { + return base_dir_ + config_.value("embedding_file", "embeddings_bf16.bin"); } -private: - virtual std::vector tokenizer(const std::string& query) override; - virtual bool is_stop(int token_id) override; -}; -class Qwen2 : public Llama2_7b { -public: - Qwen2() { - model_name_ = "Qwen2"; - tokenizer_.reset(new HuggingfaceTokenizer); + + std::string tokenizer_file() const { + return base_dir_ + 
config_.value("tokenizer_file", "tokenizer.txt"); } -private: - virtual std::vector tokenizer(const std::string& query) override; - virtual bool is_stop(int token_id) override; -}; -class Qwen2_0_5b : public Qwen2 { -public: - Qwen2_0_5b() { - model_name_ = "Qwen2_0.5b"; - layer_nums_ = 24; - key_value_shape_ = {2, 1, 0, 16, 64}; - hidden_size_ = 1024; + std::string visual_model() const { + return base_dir_ + config_.value("visual_model", "visual.mnn"); } -}; + // model file config end > -class Qwen2_1_8b : public Qwen2 { -public: - Qwen2_1_8b() { - model_name_ = "Qwen2_1.8b"; - layer_nums_ = 24; - key_value_shape_ = {2, 1, 16, 0, 128}; - hidden_size_ = 2048; + // < generate config start + int max_new_tokens() const { + return config_.value("max_new_tokens", 512); } -}; + // generate config end > -class Qwen2_4b : public Qwen2 { -public: - Qwen2_4b() { - model_name_ = "Qwen2_4b"; - layer_nums_ = 40; - key_value_shape_ = {2, 1, 20, 0, 128}; - hidden_size_ = 2560; + // < backend config start + std::string backend_type() const { + return config_.value("backend_type", "cpu"); } -}; -class Qwen2_7b : public Qwen2 { -public: - Qwen2_7b() { - model_name_ = "Qwen2_7b"; - layer_nums_ = 32; - key_value_shape_ = {2, 1, 32, 0, 128}; - hidden_size_ = 4096; + int thread_num() const { + return config_.value("thread_num", 4); } -}; -class TinyLlama : public Llama2_7b { -public: - TinyLlama() { - model_name_ = "TinyLlama"; - layer_nums_ = 22; - key_value_shape_ = {2, 1, 4, 0, 64}; + std::string precision() const { + return config_.value("precision", "low"); } -private: - virtual std::vector tokenizer(const std::string& query) override; -}; -class Yi_6b : public Llama2_7b { -public: - Yi_6b() { - model_name_ = "Yi_6b"; - key_value_shape_ = {2, 1, 4, 0, 128}; + std::string memory() const { + return config_.value("memory", "low"); } -private: - virtual std::vector tokenizer(const std::string& query) override; - virtual bool is_stop(int token_id) override; -}; -// Llm end + // backend config end > -// Embedding start -class Embedding { -public: - Embedding() { - // default tokenier is Bert - tokenizer_.reset(new BertTokenizer); + // < llm model config start + bool is_single() const { + return llm_config_.value("is_single", true); } - virtual ~Embedding() { - module_.reset(); - runtime_manager_.reset(); + + bool is_visual() const { + return llm_config_.value("is_visual", false); } - static Embedding* createEmbedding(const std::string& path, std::string model_type = "auto"); - static float dist(VARP var0, VARP var1); - void load(const std::string& model_dir); - VARP embedding(const std::string& txt); + + int hidden_size() const { + return llm_config_.value("hidden_size", 4096); + } + + int layer_nums() const { + return llm_config_.value("layer_nums", 32); + } + + std::vector key_value_shape() const { + return llm_config_.value("key_value_shape", std::vector{}); + } + + std::string attention_mask() const { + return llm_config_.value("attention_mask", "int"); + } + + std::string prompt_template() const { + return llm_config_.value("prompt_template", ""); + } + // llm model config end > +}; + +class MNN_PUBLIC Llm { +public: + Llm(std::shared_ptr config) : config_(config) {} + virtual ~Llm(); + static Llm* createLLM(const std::string& config_path); + void chat(); + void trace(bool start); + virtual void load(); + VARP forward(const std::vector& input_ids); + int sample(VARP logits, const std::vector& pre_ids); + std::string apply_chat_template(const std::string& input_str) const; + std::string response(const 
std::string& input_str, std::ostream* os = &std::cout, const char* end_with = nullptr); + void generate_init(); + std::string generate(const std::vector& input_ids, std::ostream* os, const char* end_with); + std::vector generate(const std::vector& input_ids, int max_new_tokens = -1); void print_speed(); - int dim() { return hidden_size_; } + friend class Pipeline; public: - // time - int64_t embedding_us_ = 0; + // forward info int prompt_len_ = 0; -protected: - std::vector tokenizer_encode(const std::string& input_str); -protected: - // model configs - int layer_nums_ = 0; - int hidden_size_ = 1024; - std::string model_name_ = ""; - // tokenizer + int gen_seq_len_ = 0; + int all_seq_len_ = 0; + // time + int64_t prefill_us_ = 0; + int64_t decode_us_ = 0; + bool is_single_ = true; + std::shared_ptr config_; std::unique_ptr tokenizer_; -private: - virtual std::vector tokenizer(const std::string& query) = 0; - virtual VARP gen_attention_mask(int seq_len) = 0; - virtual VARP gen_position_ids(int seq_len) = 0; -private: - // MNN Modules +protected: + std::vector key_value_shape_ = {}; + std::vector past_key_values_; + VARP inputs_embeds_, attention_mask_, position_ids_; std::shared_ptr runtime_manager_; - std::shared_ptr module_; - // model dir - std::string model_dir_; + std::vector> modules_; + std::vector> decode_modules_; + std::vector> prefill_modules_; + void init_runtime(); + std::string decode(int id); + bool is_stop(int token_id); + virtual std::vector tokenizer(const std::string& query); + virtual VARP embedding(const std::vector& input_ids); + virtual VARP gen_attention_mask(int seq_len); + virtual VARP gen_position_ids(int seq_len); }; -// some embedding models -class Bge : public Embedding { +class Lvlm : public Llm { public: - Bge() { - model_name_ = "Bge"; - layer_nums_ = 24; - hidden_size_ = 1024; + Lvlm(std::shared_ptr config) : Llm(config) { + img_size_ = config->llm_config_.value("img_size", img_size_); + imgpad_len_ = config->llm_config_.value("imgpad_len", imgpad_len_); + img_start_ = config->llm_config_.value("img_start", img_start_); + img_end_ = config->llm_config_.value("img_end", img_end_); + img_pad_ = config->llm_config_.value("img_pad", img_pad_); } + ~Lvlm() { visual_module_.reset(); } + virtual void load() override; +private: + int img_size_ = 448, imgpad_len_ = 256, img_start_ = 151857, img_end_ = 151858, img_pad_ = 151859; + std::shared_ptr visual_module_; + VARP visual_embedding(const std::vector& input_ids); + std::vector url_encode(const std::string& url); + virtual std::vector tokenizer(const std::string& query) override; + virtual VARP embedding(const std::vector& input_ids) override; +}; +// Llm end + +// Embedding start +class Embedding : public Llm { +public: + Embedding(std::shared_ptr config) : Llm(config) {} + static Embedding* createEmbedding(const std::string& config_path); + static float dist(VARP var0, VARP var1); + virtual void load() override; + VARP embedding(const std::string& txt); + int dim() { return config_->hidden_size(); } private: virtual std::vector tokenizer(const std::string& query) override; virtual VARP gen_attention_mask(int seq_len) override; virtual VARP gen_position_ids(int seq_len) override; }; - // Embedding end #endif // LLM_hpp diff --git a/transformers/llm/engine/include/tokenizer.hpp b/transformers/llm/engine/include/tokenizer.hpp index 7711e0eee..16d10861d 100644 --- a/transformers/llm/engine/include/tokenizer.hpp +++ b/transformers/llm/engine/include/tokenizer.hpp @@ -17,19 +17,35 @@ class Tokenizer { public: + static 
constexpr int MAGIC_NUMBER = 430; + enum TokenizerType { + SENTENCEPIECE = 0, + TIKTOIKEN = 1, + BERT = 2, + HUGGINGFACE = 3 + }; Tokenizer() = default; virtual ~Tokenizer() = default; - virtual bool load(const std::string& filename) = 0; - virtual std::vector encode(const std::string& str) = 0; + static Tokenizer* createTokenizer(const std::string& filename); + bool is_stop(int token); + std::vector encode(const std::string& str); virtual std::string decode(int id) = 0; +protected: + virtual void load_special(std::ifstream& file); + virtual bool load_vocab(std::ifstream& file) = 0; + virtual void encode(const std::string& str, std::vector& ids) = 0; + std::vector special_tokens_; + std::vector stop_tokens_; + std::vector prefix_tokens_; }; class Sentencepiece : public Tokenizer { public: Sentencepiece() = default; - virtual bool load(const std::string& filename) override; - virtual std::vector encode(const std::string& str) override; virtual std::string decode(int id) override; +protected: + virtual bool load_vocab(std::ifstream& file) override; + virtual void encode(const std::string& str, std::vector& ids) override; private: enum ModelType { UNIGRAM = 1, @@ -76,10 +92,10 @@ class Sentencepiece : public Tokenizer { class Tiktoken : public Tokenizer { public: Tiktoken() = default; - virtual bool load(const std::string& filename) override; - virtual std::vector encode(const std::string& str) override; virtual std::string decode(int id) override; protected: + virtual bool load_vocab(std::ifstream& file) override; + virtual void encode(const std::string& str, std::vector& ids) override; std::unordered_map encoder_; std::vector decoder_; }; @@ -87,7 +103,8 @@ class Tiktoken : public Tokenizer { class BertTokenizer : public Tiktoken { public: BertTokenizer() = default; - virtual std::vector encode(const std::string& str) override; +protected: + virtual void encode(const std::string& str, std::vector& ids) override; private: std::vector word_piece(const std::string& token); }; @@ -104,9 +121,10 @@ struct hash_pair_wstring { using BPERanks = std::unordered_map, int, hash_pair_wstring>; public: HuggingfaceTokenizer() = default; - virtual bool load(const std::string& filename) override; - virtual std::vector encode(const std::string& str) override; virtual std::string decode(int id) override; +protected: + virtual bool load_vocab(std::ifstream& file) override; + virtual void encode(const std::string& str, std::vector& ids) override; private: void bpe(const std::wstring& token, const BPERanks& bpe_ranks, std::vector* result); BPERanks bpe_ranks_; diff --git a/transformers/llm/engine/llm_demo.cpp b/transformers/llm/engine/llm_demo.cpp index f8eb04593..3c6512661 100644 --- a/transformers/llm/engine/llm_demo.cpp +++ b/transformers/llm/engine/llm_demo.cpp @@ -30,7 +30,6 @@ static void trace_prepare(Llm* llm) { decode_len += llm->gen_seq_len_; prefill_time += llm->prefill_us_; decode_time += llm->decode_us_; - llm->reset(); } MNN_PRINT("Prepare for resize opt End\n"); llm->trace(false); @@ -96,7 +95,6 @@ static int benchmark(Llm* llm, const std::vector& prompts) { decode_len += llm->gen_seq_len_; prefill_time += llm->prefill_us_; decode_time += llm->decode_us_; - llm->reset(); } float prefill_s = prefill_time / 1e6; float decode_s = decode_time / 1e6; @@ -114,7 +112,6 @@ static int benchmark(Llm* llm, const std::vector& prompts) { static int ceval(Llm* llm, const std::vector& lines, std::string filename) { auto csv_data = parse_csv(lines); int right = 0, wrong = 0; - llm->max_seq_len_ = 512; std::vector 
answers; for (int i = 1; i < csv_data.size(); i++) { const auto& elements = csv_data[i]; @@ -127,7 +124,6 @@ static int ceval(Llm* llm, const std::vector& lines, std::string fi printf("%s", prompt.c_str()); printf("## 进度: %d / %lu\n", i, lines.size() - 1); auto res = llm->response(prompt.c_str()); - llm->reset(); answers.push_back(res); } { @@ -175,33 +171,23 @@ static int eval(Llm* llm, std::string prompt_file) { int main(int argc, const char* argv[]) { if (argc < 2) { - std::cout << "Usage: " << argv[0] << " model_dir " << std::endl; + std::cout << "Usage: " << argv[0] << " config.json " << std::endl; return 0; } - std::string model_dir = argv[1]; - int forwardType = 0; - if (argc >= 3) { - std::istringstream os(argv[2]); - os >> forwardType; - } - int memoryprecision = 10; - if (argc >= 4) { - std::istringstream os(argv[3]); - os >> memoryprecision; - } - std::cout << "model path is " << model_dir << std::endl; - std::unique_ptr llm(Llm::createLLM(model_dir, "auto", forwardType, memoryprecision)); + std::string config_path = argv[1]; + std::cout << "config path is " << config_path << std::endl; + std::unique_ptr llm(Llm::createLLM(config_path)); { AUTOTIME; - llm->load(model_dir); + llm->load(); } - { + if (true) { AUTOTIME; trace_prepare(llm.get()); } - if (argc < 5) { + if (argc < 3) { llm->chat(); } - std::string prompt_file = argv[4]; + std::string prompt_file = argv[2]; return eval(llm.get(), prompt_file); } diff --git a/transformers/llm/engine/src/llm.cpp b/transformers/llm/engine/src/llm.cpp index 77bf71fa2..86ff80596 100644 --- a/transformers/llm/engine/src/llm.cpp +++ b/transformers/llm/engine/src/llm.cpp @@ -16,6 +16,8 @@ #include "cpp/ExprDebug.hpp" #include "llm.hpp" #include "tokenizer.hpp" +// 0: no debug, 1: test op time, 2: print tensor info +#define DEBUG_MODE 0 #ifdef USING_VISUAL_MODEL #include "httplib.h" @@ -23,87 +25,190 @@ #endif // Llm start -Llm* Llm::createLLM(const std::string& path, std::string model_type, int forwardType, int preicsionmemory) { - auto size = path.size(); - - // end with '.mnn' is single model file, otherwise split block models - bool is_single = (size > 4 && - path[size - 4] == '.' 
&& - path[size - 3] == 'm' && - path[size - 2] == 'n' && - path[size - 1] == 'n'); +Llm* Llm::createLLM(const std::string& config_path) { + std::shared_ptr config(new LlmConfig(config_path)); Llm* llm = nullptr; - if (model_type == "auto") { - model_type = path; - } - if (model_type.find("chatglm") != std::string::npos) { - if (model_type.find("chatglm2") != std::string::npos) { - llm = new Chatglm2_6b; - } else if (model_type.find("chatglm3") != std::string::npos) { - llm = new Chatglm2_6b; - llm->model_name_ = "Chatglm3_6b"; - } else { - llm = new Chatglm_6b; + if (config->is_visual()) { + llm = new Lvlm(config); + } else { + llm = new Llm(config); + } + return llm; +} + +static MNNForwardType backend_type_convert(const std::string& type_str) { + if (type_str == "cpu") return MNN_FORWARD_CPU; + if (type_str == "metal") return MNN_FORWARD_METAL; + if (type_str == "cuda") return MNN_FORWARD_CUDA; + if (type_str == "opencl") return MNN_FORWARD_OPENCL; + if (type_str == "opengl") return MNN_FORWARD_OPENGL; + if (type_str == "vulkan") return MNN_FORWARD_VULKAN; + if (type_str == "npu") return MNN_FORWARD_NN; + return MNN_FORWARD_AUTO; +} + +void Llm::init_runtime() { + ScheduleConfig config; + BackendConfig cpuBackendConfig; + config.type = backend_type_convert(config_->backend_type()); + config.numThread = config_->thread_num(); + if (config_->memory() == "low") { + cpuBackendConfig.memory = BackendConfig::Memory_Low; + } + if (config_->precision() == "low") { + cpuBackendConfig.precision = BackendConfig::Precision_Low; + } + config.backendConfig = &cpuBackendConfig; + ExecutorScope::Current()->setGlobalExecutorConfig(config.type, cpuBackendConfig, config.numThread); + + runtime_manager_.reset(Executor::RuntimeManager::createRuntimeManager(config)); + runtime_manager_->setHint(MNN::Interpreter::MEM_ALLOCATOR_TYPE, 0); +#if DEBUG_MODE==1 + runtime_manager_->setMode(MNN::Interpreter::Session_Debug); + _initTimeTrace(); +#endif +#if DEBUG_MODE==2 + runtime_manager_->setMode(MNN::Interpreter::Session_Debug); + _initTensorStatic(); +#endif + { + runtime_manager_->setCache(".tempcache"); + } +} + +void Llm::load() { + init_runtime(); + // init module status + key_value_shape_ = config_->key_value_shape(); + is_single_ = config_->is_single(); + { + std::ifstream embedding_bin(config_->embedding_file()); + embedding_bin.close(); + } + MNN_PRINT("### is_single_ = %d\n", is_single_); + // 1. load vocab + MNN_PRINT("load tokenizer\n"); + tokenizer_.reset(Tokenizer::createTokenizer(config_->tokenizer_file())); + MNN_PRINT("load tokenizer Done\n"); + // 3. load model + Module::Config module_config; + module_config.shapeMutable = true; + module_config.rearrange = true; + int layer_nums = config_->layer_nums(); + if (is_single_) { + // load single model + key_value_shape_.insert(key_value_shape_.begin(), layer_nums); + modules_.resize(1); + std::string model_path = config_->llm_model(); + MNN_PRINT("load %s ... 
", model_path.c_str()); + runtime_manager_->setExternalFile(config_->llm_weight()); + modules_[0].reset(Module::load( + {"input_ids", "attention_mask", "position_ids", "past_key_values"}, + {"logits", "presents"}, model_path.c_str(), runtime_manager_, &module_config)); + MNN_PRINT("Done!\n"); + } else { + // load split models + modules_.resize(layer_nums + 2); + // load lm model + modules_[layer_nums].reset(Module::load({}, {}, config_->lm_model().c_str(), runtime_manager_, &module_config)); + // load block models + for (int i = 0; i < layer_nums; i++) { + std::string model_path = config_->block_model(i); + MNN_PRINT("load %s ... ", model_path.c_str()); + modules_[i].reset(Module::load( + {"inputs_embeds", "attention_mask", "position_ids", "past_key_values"}, + {"hidden_states", "presents"}, model_path.c_str(), runtime_manager_, &module_config)); + MNN_PRINT("Done!\n"); } - } else if (model_type.find("codegeex2") != std::string::npos) { - llm = new Chatglm2_6b; - llm->model_name_ = "Codegeex2_6b"; - } else if (model_type.find("qwen1.5") != std::string::npos || - model_type.find("qwen2") != std::string::npos) { - if (model_type.find("0.5b") != std::string::npos) { - llm = new Qwen2_0_5b; - } else if (model_type.find("1.8b") != std::string::npos) { - llm = new Qwen2_1_8b; - } else if (model_type.find("4b") != std::string::npos) { - llm = new Qwen2_4b; - } else if (model_type.find("7b") != std::string::npos) { - llm = new Qwen2_7b; + } + decode_modules_.resize(modules_.size()); + for (int v=0; vtraceOrOptimize(status); + } + runtime_manager_->updateCache(); +} + +VARP Llm::forward(const std::vector& input_ids) { + int seq_len = input_ids.size(); + auto attention_mask = gen_attention_mask(seq_len); + auto position_ids = gen_position_ids(seq_len); + VARP logits; + if (is_single_) { + // single model + auto hidden_states = embedding(input_ids); + auto outputs = modules_.back()->onForward({hidden_states, attention_mask, position_ids, past_key_values_[0]}); + if (outputs.empty()) { + return nullptr; } - } else if (model_type.find("qwen") != std::string::npos) { - if (model_type.find("1.8") != std::string::npos) { - llm = new Qwen_1_8b; - } else if (model_type.find("vl") != std::string::npos) { - llm = new Qwen_vl; - } else { - llm = new Qwen_7b; + ExecutorScope::Current()->gc(Executor::FULL); + logits = outputs[0]; + past_key_values_[0] = outputs[1]; + } else { + // split block models + int layer_nums = config_->layer_nums(); + auto hidden_states = embedding(input_ids); + ExecutorScope::Current()->gc(Executor::FULL); + for (int i = 0; i < layer_nums; i++) { + AUTOTIME; + auto outputs = modules_[i]->onForward({hidden_states, attention_mask, position_ids, past_key_values_[i]}); + hidden_states = outputs[0]; + past_key_values_[i] = outputs[1]; } - } else if (model_type.find("llama2") != std::string::npos) { - llm = new Llama2_7b; - } else if (model_type.find("baichuan") != std::string::npos) { - llm = new Llama2_7b; - llm->model_name_ = "Baichuan2_7b"; - } else if (model_type.find("phi2") != std::string::npos) { - llm = new Phi_2; - } else if (model_type.find("internlm") != std::string::npos) { - llm = new Llama2_7b; - llm->model_name_ = "Internlm_7b"; - } else if (model_type.find("deepseek") != std::string::npos) { - llm = new Llama2_7b; - llm->model_name_ = "deepseek_7b"; - llm->layer_nums_ = 30; - } else if (model_type.find("tinyllama") != std::string::npos) { - llm = new TinyLlama; - llm->model_name_ = "TinyLlama"; - } else if (model_type.find("yi") != std::string::npos) { - llm = new Yi_6b; - 
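[Editor's aside, not part of the patch] Llm::sample() above is greedy decoding with a repetition penalty: previously generated ids are scaled before taking the argmax. The same idea on a plain score buffer, with illustrative names:

    #include <cstddef>
    #include <vector>

    // Greedy sampling with repetition penalty, as in Llm::sample(): tokens that were
    // already generated are pushed down, then the highest remaining score wins.
    static int sampleGreedy(std::vector<float> scores, const std::vector<int>& prevIds,
                            float penalty = 1.1f) {
        for (int id : prevIds) {
            float s = scores[id];
            scores[id] = s < 0 ? s * penalty : s / penalty;
        }
        int best = 0;
        for (size_t i = 1; i < scores.size(); ++i) {
            if (scores[i] > scores[best]) best = static_cast<int>(i);
        }
        return best;
    }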
llm->model_name_ = "Yi_6b"; - } else if (model_type.find("llama3") != std::string::npos) { - llm = new Llama3_8b; - llm->model_name_ = "Llama3_8b"; - } else if (model_type.find("MiniCPM_1_2b") != std::string::npos) { - llm = new MiniCPM_1_2b; - } else if (model_type.find("MiniCPM_2_4b") != std::string::npos) { - llm = new MiniCPM_2_4b; - } - if (!llm) { - std::cerr << "model type can't judge!" << std::endl; - return llm; - } - llm->mForwardType = forwardType; - llm->is_single_ = is_single; - llm->mPrecisionMemory = preicsionmemory; - std::cout << "### model name : "<< llm->model_name_ << std::endl; - return llm; + ExecutorScope::Current()->gc(Executor::FULL); + { + AUTOTIME; + auto outputs = modules_[layer_nums]->onForward({hidden_states}); + logits = outputs[0]; + } + } + all_seq_len_ += seq_len; + gen_seq_len_++; + return logits; +} + +int Llm::sample(VARP logits, const std::vector& pre_ids) { + auto scores = (float*)(logits->readMap()); + auto size = logits->getInfo()->size; + float max_score = 0; + int token_id = 0; + // repetition penalty + const float repetition_penalty = 1.1; + for (auto id : pre_ids) { + float score = scores[id]; + scores[id] = score < 0 ? score * repetition_penalty : score / repetition_penalty; + } + // argmax + for (int i = 0; i < size; i++) { + float score = scores[i]; + if (score > max_score) { + max_score = score; + token_id = i; + } + } + return token_id; +} + +std::string Llm::apply_chat_template(const std::string& input_str) const { + auto prompt = config_->prompt_template(); + if (prompt.empty()) return input_str; + const std::string placeholder = "%s"; + size_t start_pos = prompt.find(placeholder); + if (start_pos == std::string::npos) return input_str; + prompt.replace(start_pos, placeholder.length(), input_str); + return prompt; } void Llm::chat() { @@ -115,7 +220,7 @@ void Llm::chat() { break; } if (input_str == "/reset") { - reset(); + // reset(); std::cout << "\nA: reset done." 
<< std::endl; continue; } @@ -123,10 +228,9 @@ void Llm::chat() { response(input_str); std::cout << std::endl; } - reset(); } -void Llm::response_init() { +void Llm::generate_init() { // init status gen_seq_len_ = 0; all_seq_len_ = 0; @@ -136,34 +240,72 @@ void Llm::response_init() { if (is_single_) { past_key_values_.push_back(_Input(key_value_shape_, NCHW)); } else { - for (int i = 0; i < layer_nums_; i++) { + for (int i = 0; i < config_->layer_nums(); i++) { past_key_values_.push_back(_Input(key_value_shape_, NCHW)); } } } -std::string Llm::response_impl(const std::vector& input_ids, std::ostream* os, const char* end_with) { +std::vector Llm::generate(const std::vector& input_ids, int max_new_tokens) { + generate_init(); + std::vector output_ids, all_ids = input_ids; prompt_len_ = static_cast(input_ids.size()); + if (max_new_tokens < 0) { max_new_tokens = config_->max_new_tokens(); } + // prefill + auto logits = forward(input_ids); + if (logits.get() == nullptr) { + return {}; + } + int token = sample(logits, all_ids); + output_ids.push_back(token); + all_ids.push_back(token); + // decode + while (gen_seq_len_ < max_new_tokens) { + logits = forward({token}); + if (logits.get() == nullptr) { + return {}; + } + token = sample(logits, all_ids); + if (is_stop(token)) { break; } + output_ids.push_back(token); + all_ids.push_back(token); + } + return output_ids; +} + +std::string Llm::generate(const std::vector& input_ids, std::ostream* os, const char* end_with) { + prompt_len_ = static_cast(input_ids.size()); + std::vector all_ids = input_ids; auto st = std::chrono::system_clock::now(); modules_ = prefill_modules_; - int token = forward(input_ids); + auto logits = forward(input_ids); + if (nullptr == logits.get()) { + return ""; + } + int token = sample(logits, all_ids); + all_ids.push_back(token); auto et = std::chrono::system_clock::now(); - history_.push_back(token); + modules_ = decode_modules_; std::string output_str = decode(token); prefill_us_ = std::chrono::duration_cast(et - st).count(); *os << output_str << std::flush; - modules_ = decode_modules_; - - while (gen_seq_len_ < max_seq_len_) { + while (gen_seq_len_ < config_->max_new_tokens()) { st = std::chrono::system_clock::now(); - token = forward({token}); + logits = forward({token}); + if (nullptr == logits.get()) { + return ""; + } + if (logits->getInfo()->size == 0) { + return ""; + } + token = sample(logits, all_ids); et = std::chrono::system_clock::now(); decode_us_ += std::chrono::duration_cast(et - st).count(); if (is_stop(token)) { *os << end_with << std::flush; break; } - history_.push_back(token); + all_ids.push_back(token); auto word = decode(token); *os << word << std::flush; output_str += word; @@ -174,30 +316,45 @@ std::string Llm::response_impl(const std::vector& input_ids, std::ostream* return output_str; } -std::string Llm::response(const std::string& query, std::ostream* os, const char* end_with) { - response_init(); - if (!end_with) { - end_with = "\n"; - } - // response - auto input_ids = tokenizer(query); - if (!history_.empty()) { - std::copy(input_ids.begin(), input_ids.end(), std::back_inserter(history_)); - input_ids = history_; - } else { - history_ = input_ids; - } - return response_impl(input_ids, os, end_with); +std::vector Llm::tokenizer(const std::string& query) { + auto prompt = apply_chat_template(query); + auto input_ids = tokenizer_->encode(prompt); + return input_ids; } -std::string Llm::response_nohistory(const std::string& query, std::ostream* os, const char* end_with) { - response_init(); - 
if (!end_with) { - end_with = "\n"; - } - // response +std::string Llm::response(const std::string& query, std::ostream* os, const char* end_with) { + generate_init(); + if (!end_with) { end_with = "\n"; } auto input_ids = tokenizer(query); - return response_impl(input_ids, os, end_with); + return generate(input_ids, os, end_with); +} +Llm::~Llm() { +#if DEBUG_MODE==1 + if (nullptr != gTimeTraceInfo) { + float opSummer = 0.0f; + float opFlopsSummber = 0.0f; + for (auto& iter : gTimeTraceInfo->mTypes) { + float summer = 0.0f; + float summerflops = 0.0f; + for (auto& t : iter.second) { + for (auto& t0 : t.second) { + summer += t0.first; + summerflops += t0.second; + } + } + summer = summer; + summerflops = summerflops; + MNN_PRINT("%s : %.7f, FLOP: %.7f, Speed: %.7f GFlops\n", iter.first.c_str(), summer, summerflops, summerflops / summer); + opSummer += summer; + opFlopsSummber+= summerflops; + } + MNN_PRINT("OP Summer: %.7f, Flops: %.7f, Speed: %.7f GFlops\n", opSummer, opFlopsSummber, opFlopsSummber/opSummer); + } +#endif + decode_modules_.clear(); + prefill_modules_.clear(); + modules_.clear(); + runtime_manager_.reset(); } void Llm::print_speed() { @@ -218,171 +375,6 @@ void Llm::print_speed() { printf("##################################\n"); } -void Llm::reset() { - history_.clear(); -} - -void Llm::load(const std::string& model_dir) { - model_dir_ = model_dir; - // init - ScheduleConfig config; - BackendConfig cpuBackendConfig; - config.type = (MNNForwardType)mForwardType; - if (config.type == MNN_FORWARD_OPENCL) { - config.numThread = MNN_GPU_MEMORY_BUFFER | MNN_GPU_TUNING_NORMAL; - } - ExecutorScope::Current()->setGlobalExecutorConfig(MNN_FORWARD_CPU, cpuBackendConfig, config.numThread); - - cpuBackendConfig.precision = (BackendConfig::PrecisionMode)(mPrecisionMemory % 4); - cpuBackendConfig.memory = (BackendConfig::MemoryMode)((mPrecisionMemory / 4) % 4); - printf("### precision, memory = %d, %d\n", (mPrecisionMemory % 4), ((mPrecisionMemory / 4) % 4)); - config.backendConfig = &cpuBackendConfig; - runtime_manager_.reset(Executor::RuntimeManager::createRuntimeManager(config)); - runtime_manager_->setHint(MNN::Interpreter::MEM_ALLOCATOR_TYPE, 0); -// runtime_manager_->setMode(MNN::Interpreter::Session_Debug); -// _initTensorStatic(); - { - runtime_manager_->setCache(".tempcache"); - } - load_progress_ = 0.f; - printf("load tokenizer\n"); - // 1. load vocab - std::string tokenizer_path = model_dir + "/tokenizer.txt"; - if (is_single_) { - size_t pos = model_dir.find_last_of("/\\"); - std::string dir_path = (pos != std::string::npos) ? model_dir.substr(0, pos + 1) : ""; - model_dir_ = dir_path; - tokenizer_path = dir_path + "/tokenizer.txt"; - } - load_progress_ += 5.f; - tokenizer_->load(tokenizer_path); - load_progress_ += 5.f; - printf("load tokenizer Done\n"); - { - disk_embedding_file_ = model_dir_ + "/embeddings_bf16.bin"; - std::ifstream embedding_bin(disk_embedding_file_); - is_disk_embedding_ = embedding_bin.good(); - MNN_PRINT("### disk embedding is %d\n", is_disk_embedding_); - embedding_bin.close(); - } - // 2. load model - Module::Config module_config; - module_config.shapeMutable = true; - module_config.rearrange = true; - if (is_single_) { - key_value_shape_.insert(key_value_shape_.begin(), layer_nums_); - modules_.resize(1); - std::string model_path = model_dir; - std::string external_path = model_dir + ".weight"; - MNN_PRINT("load %s ... 
", model_path.c_str()); - runtime_manager_->setExternalFile(external_path); - modules_[0].reset(Module::load( - {"input_ids", "attention_mask", "position_ids", "past_key_values"}, - {"token_id", "presents"}, model_path.c_str(), runtime_manager_, &module_config)); - MNN_PRINT("Done!\n"); - load_progress_ += 90.f; - } else { - // 2. load models - modules_.resize(layer_nums_ + 2); - float step = 90.0 / modules_.size(); - char buffer[50]; - // load lm model - std::string lm_model_path = model_dir + "/lm.mnn"; - MNN_PRINT("[%3.0f%% ] load %s model ... ", load_progress_, lm_model_path.c_str()); - modules_[layer_nums_].reset(Module::load({}, {}, lm_model_path.c_str(), runtime_manager_, &module_config)); - MNN_PRINT("Done!\n"); - load_progress_ += step; - if (!is_disk_embedding_) { - std::string embedding_model_path = model_dir + "/embedding.mnn"; - MNN_PRINT("[%3.0f%% ] load %s model ... ", load_progress_, embedding_model_path.c_str());fflush(stdout); - modules_[layer_nums_ + 1].reset(Module::load({}, {}, embedding_model_path.c_str(), runtime_manager_, &module_config)); - MNN_PRINT("Done!\n"); - load_progress_ += step; - } - if (is_visual_) { - std::string visual_model_path = model_dir + "/visual.mnn"; - MNN_PRINT("[%3.0f%% ] load %s model ... ", load_progress_, visual_model_path.c_str());fflush(stdout); - module_config.rearrange = false; - visual_module_.reset(Module::load({}, {}, visual_model_path.c_str(), runtime_manager_, &module_config)); - MNN_PRINT("Done!\n"); - module_config.rearrange = true; - } - // load glm_block models - for (int i = 0; i < layer_nums_; i++) { - load_progress_ += step; - std::string model_path = model_dir + "/block_" + std::to_string(i) + ".mnn"; - MNN_PRINT("[%3.0f%% ] load %s model ... ", load_progress_, model_path.c_str()); - modules_[i].reset(Module::load( - {"inputs_embeds", "attention_mask", "position_ids", "past_key_values"}, - {"hidden_states", "presents"}, model_path.c_str(), runtime_manager_, &module_config)); - MNN_PRINT("Done!\n"); - } - } - if (config.type == MNN_FORWARD_OPENCL) { - // warmup(); - } - decode_modules_.resize(modules_.size()); - for (int v=0; v tmp(1, 0); - forward(tmp); - all_seq_len_ = 0; - gen_seq_len_ = 0; - printf("Done\n"); -} - -int Llm::forward(const std::vector& input_ids) { - int seq_len = input_ids.size(); - auto attention_mask = gen_attention_mask(seq_len); - auto position_ids = gen_position_ids(seq_len); - int id = -1; - if (is_single_) { - // single model - auto hidden_states = _Const(input_ids.data(), {seq_len}, NCHW, halide_type_of()); - if (is_disk_embedding_) { - hidden_states = embedding(input_ids); - } - auto outputs = modules_.back()->onForward({hidden_states, attention_mask, position_ids, past_key_values_[0]}); - ExecutorScope::Current()->gc(Executor::FULL); - id = outputs[0]->readMap()[0]; - past_key_values_[0] = outputs[1]; - } else { - // split block models - auto hidden_states = embedding(input_ids); - ExecutorScope::Current()->gc(Executor::FULL); - for (int i = 0; i < layer_nums_; i++) { - AUTOTIME; - auto outputs = modules_[i]->onForward({hidden_states, attention_mask, position_ids, past_key_values_[i]}); - hidden_states = outputs[0]; - past_key_values_[i] = outputs[1]; - } - ExecutorScope::Current()->gc(Executor::FULL); - { - AUTOTIME; - auto outputs = modules_[layer_nums_]->onForward({hidden_states}); - id = outputs[0]->readMap()[0]; - } - } - all_seq_len_ += seq_len; - gen_seq_len_++; - return id; -} - static inline bool needNewVar(VARP var, int axis, int seq_len) { if (var == nullptr) { return true; @@ 
-393,27 +385,23 @@ static inline bool needNewVar(VARP var, int axis, int seq_len) { return false; } -VARP Llm::txt_embedding(const std::vector& input_ids) { - if (!is_disk_embedding_) { - // using model forward - auto inputs_ids_ = _Const(input_ids.data(), {static_cast(input_ids.size())}, NCHW, halide_type_of()); - auto hidden_states = modules_[layer_nums_ + 1]->onForward({inputs_ids_})[0]; - return hidden_states; - } +VARP Llm::embedding(const std::vector& input_ids) { AUTOTIME; // disk embedding to save memory + int hidden_size = config_->hidden_size(); int seq_len = static_cast(input_ids.size()); if (needNewVar(inputs_embeds_, 0, seq_len)) { - inputs_embeds_ = _Input({seq_len, 1, hidden_size_}, NCHW); + inputs_embeds_ = _Input({seq_len, 1, hidden_size}, NCHW); } - size_t size = hidden_size_ * sizeof(int16_t); - FILE* file = fopen(disk_embedding_file_.c_str(), "rb"); - std::unique_ptr buffer(new int16_t[hidden_size_]); + + size_t size = hidden_size * sizeof(int16_t); + FILE* file = fopen(config_->embedding_file().c_str(), "rb"); + std::unique_ptr buffer(new int16_t[hidden_size]); for (size_t i = 0; i < seq_len; i++) { fseek(file, input_ids[i] * size, SEEK_SET); fread(buffer.get(), 1, size, file); - auto ptr = inputs_embeds_->writeMap() + i * hidden_size_ * 2; - for (int j = 0; j < hidden_size_; j++) { + auto ptr = inputs_embeds_->writeMap() + i * hidden_size * 2; + for (int j = 0; j < hidden_size; j++) { ptr[j * 2] = 0; ptr[j * 2 + 1] = buffer[j]; } @@ -422,31 +410,6 @@ VARP Llm::txt_embedding(const std::vector& input_ids) { return inputs_embeds_; } -void Llm::trace(bool start) { - auto status = MNN::Interpreter::Session_Resize_Check; - if (start) { - status = MNN::Interpreter::Session_Resize_Check; - } else { - status = MNN::Interpreter::Session_Resize_Fix; - } - for (auto& m : decode_modules_) { - m->traceOrOptimize(status); - } - runtime_manager_->updateCache(); -} - -VARP Llm::embedding(const std::vector& input_ids) { - if (is_visual_ && !gen_seq_len_) { - return visual_embedding(input_ids); - } - return txt_embedding(input_ids); -} - -std::vector Llm::tokenizer_encode(const std::string& input_str) { - auto ids = tokenizer_->encode(input_str); - return ids; -} - std::string Llm::decode(int id) { std::string word = tokenizer_->decode(id); // Fix utf-8 garbled characters @@ -457,159 +420,131 @@ std::string Llm::decode(int id) { return word; } -// Chatglm_6b -std::vector Chatglm_6b::tokenizer(const std::string& query) { - auto ids = tokenizer_encode(query); - context_len_ = ids.size(); - ids.push_back(130001); - ids.push_back(130004); - return ids; -} - -VARP Chatglm_6b::gen_attention_mask(int seq_len) { - auto attention_mask = _Input({1, 1, seq_len, seq_len}, NCHW, halide_type_of()); - auto ptr = attention_mask->writeMap(); - for (int i = 0; i < seq_len * seq_len; i++) { - ptr[i] = 0; - } - if (seq_len > 1) { - for (int i = 1; i < seq_len; i++) { - ptr[seq_len * i - 1] = 1; +VARP Llm::gen_attention_mask(int seq_len) { + if (config_->attention_mask() == "float") { + if (needNewVar(attention_mask_, 2, seq_len)) { + attention_mask_ = _Input({1, 1, seq_len, seq_len}, NCHW, halide_type_of()); + } else { + return attention_mask_; } - } - return attention_mask; -} - -VARP Chatglm_6b::gen_position_ids(int seq_len) { - auto position_ids = _Input({1, 2, seq_len}, NCHW, halide_type_of()); - auto ptr = position_ids->writeMap(); - if (seq_len == 1) { - ptr[0] = 1; - ptr[1] = all_seq_len_ - context_len_; - } else { - for (int i = 0; i < seq_len; i++) { - ptr[i] = i; - ptr[seq_len + i] = 0; - } - 
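[Editor's aside, not part of the patch] Llm::embedding() above streams one bf16 row per token id from embeddings_bf16.bin and widens it to fp32 in place by writing the stored 16 bits into the high half of each float (the `ptr[j * 2] = 0; ptr[j * 2 + 1] = buffer[j];` pair, on a little-endian host). A standalone sketch of that widening step:

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Widen a bf16 row (read as raw uint16 values) to fp32: the bf16 bits become the
    // high 16 bits of the float, the low 16 bits are zero. Same trick embedding() uses
    // when filling inputs_embeds_.
    std::vector<float> bf16RowToFloat(const std::vector<uint16_t>& row) {
        std::vector<float> out(row.size());
        for (size_t j = 0; j < row.size(); ++j) {
            uint32_t bits = static_cast<uint32_t>(row[j]) << 16;
            std::memcpy(&out[j], &bits, sizeof(float));
        }
        return out;
    }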
ptr[2 * seq_len - 1] = 1; - } - return position_ids; -} - -bool Chatglm_6b::is_stop(int token_id) { - return token_id == 130005; -} - -// Chatglm2_6b -std::vector Chatglm2_6b::tokenizer(const std::string& query) { - auto prompt = "问:" + query + "\n答:"; - auto ids = tokenizer_encode(prompt); - if (history_.empty()) { - ids.insert(ids.begin(), 64792); - ids.insert(ids.begin(), 64790); - } - return ids; -} - -VARP Chatglm2_6b::gen_attention_mask(int seq_len) { - auto attention_mask = _Input({1, 1, seq_len, seq_len}, NCHW, halide_type_of()); - auto ptr = attention_mask->writeMap(); - if (seq_len > 1) { + auto ptr = attention_mask_->writeMap(); for (int i = 0; i < seq_len; i++) { for (int j = 0; j < seq_len; j++) { - ptr[seq_len * i + j] = j > i; + ptr[seq_len * i + j] = (j > i) * std::numeric_limits::lowest(); } } + return attention_mask_; } else { - ptr[0] = 0; - } - return attention_mask; -} - -VARP Chatglm2_6b::gen_position_ids(int seq_len) { - auto position_ids = _Input({seq_len}, NCHW, halide_type_of()); - auto ptr = position_ids->writeMap(); - if (seq_len == 1) { - ptr[0] = gen_seq_len_; - } else { - for (int i = 0; i < seq_len; i++) { - ptr[i] = i; + if (needNewVar(attention_mask_, 2, seq_len)) { + attention_mask_ = _Input({1, 1, seq_len, seq_len}, NCHW, halide_type_of()); + } else { + return attention_mask_; } + auto ptr = attention_mask_->writeMap(); + if (config_->attention_mask() == "glm") { + // chatglm + for (int i = 0; i < seq_len * seq_len; i++) { + ptr[i] = 0; + } + if (seq_len > 1) { + for (int i = 1; i < seq_len; i++) { + ptr[seq_len * i - 1] = 1; + } + } + } else { + bool is_glm2 = config_->attention_mask() == "glm2"; + for (int i = 0; i < seq_len; i++) { + for (int j = 0; j < seq_len; j++) { + ptr[seq_len * i + j] = is_glm2 ? j > i : j <= i; + } + } + } + return attention_mask_; } - return position_ids; -} - -bool Chatglm2_6b::is_stop(int token_id) { - return token_id <= 2; -} - -// Phi_2 -std::vector Phi_2::tokenizer(const std::string& query) { - auto prompt = query; - auto ids = tokenizer_encode(prompt); - return ids; } -bool Phi_2::is_stop(int token_id) { - return token_id == 50256; -} - -// Qwen_7b -std::vector Qwen_7b::tokenizer(const std::string& query) { - auto ids = tokenizer_encode(query); - // auto prompt = "\n<|im_start|>user\n" + query + "<|im_end|>\n<|im_start|>assistant\n"; - ids.insert(ids.begin(), {198, 151644, 872, 198}); - ids.insert(ids.end(), {151645, 198, 151644, 77091, 198}); - return ids; -} - -VARP Qwen_7b::gen_attention_mask(int seq_len) { - if (needNewVar(attention_mask_, 2, seq_len)) { - attention_mask_ = _Input({1, 1, seq_len, seq_len}, NCHW, halide_type_of()); +VARP Llm::gen_position_ids(int seq_len) { + if (config_->attention_mask() == "glm") { + // chatglm + if (needNewVar(position_ids_, 2, seq_len)) { + position_ids_ = _Input({1, 2, seq_len}, NCHW, halide_type_of()); + } + auto ptr = position_ids_->writeMap(); + if (seq_len == 1) { + ptr[0] = all_seq_len_ - gen_seq_len_ - 2; + ptr[1] = gen_seq_len_ + 1; + } else { + for (int i = 0; i < seq_len - 1; i++) { + ptr[i] = i; + ptr[seq_len + i] = 0; + } + ptr[seq_len - 1] = seq_len - 2; + ptr[2 * seq_len - 1] = 1; + } + return position_ids_; } else { - return attention_mask_; - } - auto ptr = attention_mask_->writeMap(); - for (int i = 0; i < seq_len; i++) { - for (int j = 0; j < seq_len; j++) { - ptr[seq_len * i + j] = j <= i; + bool is_glm2 = config_->attention_mask() == "glm2"; + if (needNewVar(position_ids_, 0, seq_len)) { + position_ids_ = _Input({seq_len}, NCHW, halide_type_of()); } + auto ptr 
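[Editor's aside, not part of the patch] For the "float" attention_mask type, gen_attention_mask() builds the standard additive causal mask: zero on and below the diagonal, the lowest float value above it. The same [seq_len, seq_len] mask as a flat buffer:

    #include <limits>
    #include <vector>

    // Additive causal mask as produced for the "float" attention_mask type:
    // entry (i, j) is 0 when j <= i and lowest-float (effectively -inf) when j > i.
    std::vector<float> causalMask(int seqLen) {
        std::vector<float> mask(static_cast<size_t>(seqLen) * seqLen, 0.0f);
        for (int i = 0; i < seqLen; ++i) {
            for (int j = i + 1; j < seqLen; ++j) {
                mask[static_cast<size_t>(i) * seqLen + j] = std::numeric_limits<float>::lowest();
            }
        }
        return mask;
    }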
= position_ids_->writeMap(); + if (seq_len == 1) { + ptr[0] = is_glm2 ? gen_seq_len_ : all_seq_len_; + } else { + for (int i = 0; i < seq_len; i++) { + ptr[i] = i; + } + } + return position_ids_; } - return attention_mask_; } -VARP Qwen_7b::gen_position_ids(int seq_len) { - if (needNewVar(position_ids_, 0, seq_len)) { - position_ids_ = _Input({seq_len}, NCHW, halide_type_of()); - } - auto ptr = position_ids_->writeMap(); - if (seq_len == 1) { - ptr[0] = all_seq_len_; - } else { - for (int i = 0; i < seq_len; i++) { - ptr[i] = i; - } - } - return position_ids_; +bool Llm::is_stop(int token_id) { + return tokenizer_->is_stop(token_id); } -bool Qwen_7b::is_stop(int token_id) { - // <|endoftext|> <|im_end|> - return token_id == 151643 || token_id == 151645; +void Lvlm::load() { + Llm::load(); + Module::Config module_config; + module_config.shapeMutable = true; + module_config.rearrange = false; + visual_module_.reset(Module::load({}, {}, config_->visual_model().c_str(), runtime_manager_, &module_config)); } -// Qwen_vl -std::vector Qwen_vl::url_encode(const std::string& url) { - std::vector ascii_values(imgpad_len_, img_pad_); +std::vector Lvlm::url_encode(const std::string& url) { + std::vector ascii_values(imgpad_len_ + 2, img_pad_); ascii_values[0] = img_start_; - ascii_values[imgpad_len_ - 1] = img_end_; + ascii_values[imgpad_len_ + 1] = img_end_; for (int i = 0; i < url.size(); i++) { ascii_values[i + 1] = static_cast(url[i]); } return ascii_values; } -VARP Qwen_vl::visual_embedding(const std::vector& input_ids) { +std::vector Lvlm::tokenizer(const std::string& query) { + auto prompt = apply_chat_template(query); + // split query + std::regex img_regex("(.*?)"); + std::string::const_iterator searchStart(prompt.cbegin()); + std::smatch match; + std::vector img_info, txt_info; + std::vector ids {}; + while (std::regex_search(searchStart, prompt.cend(), match, img_regex)) { + std::cout << match[1].str() << std::endl; + auto txt_ids = tokenizer_->encode(match.prefix().str()); + ids.insert(ids.end(), txt_ids.begin(), txt_ids.end()); + auto img_ids = url_encode(match[1].str()); + ids.insert(ids.end(), img_ids.begin(), img_ids.end()); + searchStart = match.suffix().first; + } + if (searchStart != prompt.cend()) { + auto txt_ids = tokenizer_->encode(std::string(searchStart, prompt.cend())); + ids.insert(ids.end(), txt_ids.begin(), txt_ids.end()); + } + return ids; +} + +VARP Lvlm::embedding(const std::vector& input_ids) { #ifdef USING_VISUAL_MODEL int start_pos = 0, pad_pos = 0, end_pos = 0; for (int i = 0; i < input_ids.size(); i++) { @@ -625,11 +560,11 @@ VARP Qwen_vl::visual_embedding(const std::vector& input_ids) { } } if (!start_pos) { - return txt_embedding(input_ids); + return Llm::embedding(input_ids); } - std::vector prefix(input_ids.begin(), input_ids.begin() + start_pos); + std::vector prefix(input_ids.begin(), input_ids.begin() + start_pos + 1); std::vector img_ascii(input_ids.begin() + start_pos + 1, input_ids.begin() + pad_pos); - std::vector suffix(input_ids.begin() + end_pos + 1, input_ids.end()); + std::vector suffix(input_ids.begin() + end_pos, input_ids.end()); std::string img_path; for (auto ascii_val : img_ascii) { img_path += static_cast(ascii_val); @@ -671,192 +606,14 @@ VARP Qwen_vl::visual_embedding(const std::vector& input_ids) { image = MNN::Express::_Convert(image, NC4HW4); auto image_embedding = visual_module_->forward(image); image_embedding = MNN::Express::_Permute(image_embedding, {1, 0, 2}); - auto prefix_embedding = txt_embedding(prefix); - auto suffix_embedding 
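[Editor's aside, not part of the patch] Lvlm::tokenizer() above splits the prompt on image tags with std::regex before interleaving text ids with the url-encoded image placeholder block. The angle-bracketed tag literal was lost in this listing, so the "<img>...</img>" pattern below is an assumption about the original source; the splitting structure itself follows the patch:

    #include <regex>
    #include <string>
    #include <vector>

    // Split "text <img>url</img> more text" into alternating text and url pieces,
    // the same walk Lvlm::tokenizer() performs before encoding each part.
    void splitImageTags(const std::string& prompt,
                        std::vector<std::string>& texts, std::vector<std::string>& urls) {
        std::regex imgRegex("<img>(.*?)</img>");   // tag literal assumed
        auto searchStart = prompt.cbegin();
        std::smatch match;
        while (std::regex_search(searchStart, prompt.cend(), match, imgRegex)) {
            texts.push_back(match.prefix().str());
            urls.push_back(match[1].str());
            searchStart = match.suffix().first;
        }
        texts.push_back(std::string(searchStart, prompt.cend()));
    }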
= txt_embedding(suffix); + auto prefix_embedding = Llm::embedding(prefix); + auto suffix_embedding = Llm::embedding(suffix); auto embeddings = MNN::Express::_Concat({prefix_embedding, image_embedding, suffix_embedding}, 0); #else - auto embeddings = txt_embedding(input_ids); + auto embeddings = Llm::embedding(input_ids); #endif return embeddings; } - -std::vector Qwen_vl::tokenizer(const std::string& query) { - // split query - std::regex img_regex("(.*?)"); - std::string::const_iterator searchStart(query.cbegin()); - std::smatch match; - std::vector img_info, txt_info; - std::vector ids {}; - while (std::regex_search(searchStart, query.cend(), match, img_regex)) { - auto txt_ids = tokenizer_encode(match.prefix().str()); - ids.insert(ids.end(), txt_ids.begin(), txt_ids.end()); - auto img_ids = url_encode(match[1].str()); - ids.insert(ids.end(), img_ids.begin(), img_ids.end()); - searchStart = match.suffix().first; - } - if (searchStart != query.cend()) { - auto txt_ids = tokenizer_encode(std::string(searchStart, query.cend())); - ids.insert(ids.end(), txt_ids.begin(), txt_ids.end()); - } - // auto prompt = "\n<|im_start|>user\n" + query + "<|im_end|>\n<|im_start|>assistant\n"; - ids.insert(ids.begin(), {198, 151644, 872, 198}); - ids.insert(ids.end(), {151645, 198, 151644, 77091, 198}); - return ids; -} - -VARP Qwen_vl::gen_attention_mask(int seq_len) { - if (seq_len == 1) { - auto attention_mask = _Input({1, 1, 1, all_seq_len_ + 1}, NCHW, halide_type_of()); - auto ptr = attention_mask->writeMap(); - for (int i = 0; i < all_seq_len_ + 1; i++) { - ptr[i] = 0; - } - return attention_mask; - } else { - auto attention_mask = _Input({1, 1, seq_len, seq_len}, NCHW, halide_type_of()); - auto ptr = attention_mask->writeMap(); - for (int i = 0; i < seq_len; i++) { - for (int j = 0; j < seq_len; j++) { - ptr[seq_len * i + j] = (j > i) * std::numeric_limits::lowest(); - } - } - return attention_mask; - } -} - -// Llama2_7b -std::vector Llama2_7b::tokenizer(const std::string& query) { - auto ids = tokenizer_encode(query); - if (model_name_ == "Baichuan2_7b") { - // baichuan2: {query}: 195, query, 196 - ids.insert(ids.begin(), 195); - ids.push_back(196); - return ids; - } - if (model_name_ == "Internlm_7b") { - // internlm: "<|User|>:" + query + "\n<|Bot|>:"; - // 1, 333, 352, 1621, 352, 27232, query, 103027, 364, 333, 352, 23845, 352, 27232 - ids.insert(ids.begin(), {1, 333, 352, 1621, 352, 27232}); - ids.insert(ids.end(), {103027, 364, 333, 352, 23845, 352, 27232}); - return ids; - } - if (model_name_ == "deepseek_7b") { - // "<|begin▁of▁sentence|>User:" + query + "\n\nAssistant:" - ids.insert(ids.begin(), {100000, 5726, 25, 207}); - ids.insert(ids.end(), {185, 185, 77398, 25}); - return ids; - } - // llama2: [INST]{query}[/INST]: 1, 5539, 25580, 29962, query, 12452, 25580, 29962 - ids.insert(ids.begin(), {1, 5539, 25580, 29962}); - ids.insert(ids.end(), {12452, 25580, 29962}); - return ids; -} - -VARP Llama2_7b::gen_attention_mask(int seq_len) { - if (needNewVar(attention_mask_, 2, seq_len)) { - attention_mask_ = _Input({1, 1, seq_len, seq_len}, NCHW, halide_type_of()); - } else { - return attention_mask_; - } - auto ptr = attention_mask_->writeMap(); - for (int i = 0; i < seq_len; i++) { - for (int j = 0; j < seq_len; j++) { - ptr[seq_len * i + j] = (j > i) * std::numeric_limits::lowest(); - } - } - return attention_mask_; -} - -VARP Llama2_7b::gen_position_ids(int seq_len) { - if (needNewVar(position_ids_, 1, seq_len)) { - position_ids_ = _Input({1, seq_len}, NCHW, halide_type_of()); - } - auto 
ptr = position_ids_->writeMap(); - if (seq_len == 1) { - ptr[0] = all_seq_len_; - } else { - for (int i = 0; i < seq_len; i++) { - ptr[i] = i; - } - } - return position_ids_; -} - -bool Llama2_7b::is_stop(int token_id) { - if (model_name_ == "Internlm_7b") { - // 103028: - return token_id == 2 || token_id == 103028; - } - if (model_name_ == "deepseek_7b") { - return token_id == 100001; - } - return token_id == 2; -} - -std::vector MiniCPM_1_2b::tokenizer(const std::string& query) { - auto ids = tokenizer_encode(query); - // auto prompt = "<用户>" + query + ""; - ids.insert(ids.begin(), {59396, 4194, 59388}); - ids.insert(ids.end(), {59396, 10850, 59388}); - return ids; -} - -std::vector MiniCPM_2_4b::tokenizer(const std::string& query) { - auto ids = tokenizer_encode(query); - // auto prompt = "<用户>" + query + ""; - ids.insert(ids.begin(), {95396, 4194, 95388}); - ids.insert(ids.end(), {95396, 10850, 95388}); - return ids; -} - -std::vector Qwen2::tokenizer(const std::string& query) { - auto ids = tokenizer_encode(query); - // auto prompt = "<|im_start|>user\n" + query + "<|im_end|>\n<|im_start|>assistant\n"; - ids.insert(ids.begin(), {151644, 872, 198}); - ids.insert(ids.end(), {151645, 198, 151644, 77091, 198}); - return ids; -} - -bool Qwen2::is_stop(int token_id) { - return token_id == 151645 || token_id == 151643; -} - -std::vector TinyLlama::tokenizer(const std::string& query) { - auto ids = tokenizer_encode(query); - /* - <|system|> - You are a friendly chatbot who always responds in the style of a pirate - <|user|> - {query} - <|assistant|> - */ - ids.insert(ids.begin(), {1, 529, 29989, 5205, 29989, 29958, 13, 3492, 526, 263, 19780, 13563, - 7451, 1058, 2337, 10049, 29879, 297, 278, 3114, 310, 263, 21625, - 403, 2, 29871, 13, 29966, 29989, 1792, 29989, 29958, 13}); - ids.insert(ids.end(), {2, 29871, 13, 29966, 29989, 465, 22137, 29989, 29958, 13}); - return ids; -} - -std::vector Yi_6b::tokenizer(const std::string& query) { - auto prompt = "<|im_start|> user\n" + query + "<|im_end|>\n<|im_start|> assistant\n"; - auto ids = tokenizer_encode(prompt); - return ids; -} - -bool Yi_6b::is_stop(int token_id) { - return token_id == 7 || token_id == 64001; -} -std::vector Llama3_8b::tokenizer(const std::string& query) { - // <|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n+query+<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n - auto ids = tokenizer_encode(query); - ids.insert(ids.begin(), {128000, 128006, 882, 128007, 271}); - ids.insert(ids.end(), {128009, 128006, 78191, 128007, 271}); - return ids; -} - -bool Llama3_8b::is_stop(int token_id) { - return token_id == 128001 || token_id == 128009; -} // Llm end // Embedding start @@ -866,54 +623,28 @@ float Embedding::dist(VARP var0, VARP var1) { return dist; } -Embedding* Embedding::createEmbedding(const std::string& path, std::string model_type) { - auto size = path.size(); - - Embedding* embedding = nullptr; - if (model_type == "auto") { - model_type = path; - } - if (model_type.find("bge") != std::string::npos) { - embedding = new Bge; - } - if (!embedding) { - std::cerr << "model type can't judge!" 
<< std::endl; - return embedding; - } - std::cout << "### model name : "<< embedding->model_name_ << std::endl; - embedding->load(path); +Embedding* Embedding::createEmbedding(const std::string& config_path) { + std::shared_ptr config(new LlmConfig(config_path)); + Embedding* embedding = new Embedding(config); + embedding->load(); return embedding; } -void Embedding::load(const std::string& model_dir) { - if (model_dir_ == model_dir) { - return; - } - model_dir_ = model_dir; - // init - ScheduleConfig config; - BackendConfig cpuBackendConfig; - config.type = MNN_FORWARD_CPU; - // config.type = MNN_FORWARD_OPENCL; - config.numThread = 4; - cpuBackendConfig.precision = BackendConfig::Precision_Low; - cpuBackendConfig.memory = BackendConfig::Memory_Low; - config.backendConfig = &cpuBackendConfig; - runtime_manager_.reset(Executor::RuntimeManager::createRuntimeManager(config)); +void Embedding::load() { + init_runtime(); printf("load tokenizer\n"); + std::cout << config_->tokenizer_file() << std::endl; // 1. load vocab - size_t pos = model_dir.find_last_of("/\\"); - std::string dir_path = (pos != std::string::npos) ? model_dir.substr(0, pos + 1) : ""; - std::string tokenizer_path = dir_path + "/tokenizer.txt"; - tokenizer_->load(tokenizer_path); + tokenizer_.reset(Tokenizer::createTokenizer(config_->tokenizer_file())); printf("load tokenizer Done\n"); // 2. load model Module::Config module_config; module_config.shapeMutable = true; module_config.rearrange = true; - std::string model_path = model_dir; + auto model_path = config_->llm_model(); MNN_PRINT("load %s ... ", model_path.c_str()); - module_.reset(Module::load( + modules_.resize(1); + modules_[0].reset(Module::load( {"input_ids", "attention_mask", "position_ids"}, {"sentence_embeddings"}, model_path.c_str(), runtime_manager_, &module_config)); MNN_PRINT("Done!\n"); @@ -921,45 +652,26 @@ void Embedding::load(const std::string& model_dir) { VARP Embedding::embedding(const std::string& txt) { auto ids = tokenizer(txt); - prompt_len_ = ids.size(); - auto inputs_ids = _Const(ids.data(), {prompt_len_}, NCHW, halide_type_of()); - auto attention_mask = gen_attention_mask(prompt_len_); - auto position_ids = gen_position_ids(prompt_len_); - auto st = std::chrono::system_clock::now(); - auto outputs = module_->onForward({inputs_ids, attention_mask, position_ids}); - auto et = std::chrono::system_clock::now(); - embedding_us_ = std::chrono::duration_cast(et - st).count(); + int prompt_len = ids.size(); + auto inputs_ids = _Const(ids.data(), {prompt_len}, NCHW, halide_type_of()); + auto attention_mask = gen_attention_mask(prompt_len); + auto position_ids = gen_position_ids(prompt_len); + auto outputs = modules_[0]->onForward({inputs_ids, attention_mask, position_ids}); auto sentence_embeddings = outputs[0]; - // print_speed(); return sentence_embeddings; } -void Embedding::print_speed() { - auto total_s = embedding_us_ * 1e-6; - printf("\n#################################\n"); - printf(" total token = %d\n", prompt_len_); - printf(" total time = %.2f s\n", total_s); - printf(" total speed = %.2f tok/s\n", prompt_len_ / total_s); - printf("##################################\n"); -} - -std::vector Embedding::tokenizer_encode(const std::string& input_str) { - auto ids = tokenizer_->encode(input_str); - return ids; -} - -std::vector Bge::tokenizer(const std::string& query) { +std::vector Embedding::tokenizer(const std::string& query) { auto prompt = query; if (query.size() <= 256) { prompt = "为这个句子生成表示以用于检索相关文章:" + query; } - auto ids = 
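[Editor's aside, not part of the patch] The Embedding path now mirrors Llm: createEmbedding(config_path) builds the module from the same LlmConfig and loads it before returning, and dist() compares two sentence embeddings. A minimal usage sketch; the config file name is hypothetical, Embedding is assumed to be declared in llm.hpp, and dist() is assumed to be a static helper:

    #include <iostream>
    #include <memory>
    #include "llm.hpp"   // assumed to also declare Embedding

    int main() {
        std::unique_ptr<Embedding> bge(Embedding::createEmbedding("bge_config.json"));
        auto v0 = bge->embedding("What is deep learning?");
        auto v1 = bge->embedding("Deep learning is a branch of machine learning.");
        std::cout << "distance: " << Embedding::dist(v0, v1) << std::endl;
        return 0;
    }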
tokenizer_encode(prompt); - ids.insert(ids.begin(), 101); - ids.push_back(102); + prompt = apply_chat_template(prompt); + auto ids = tokenizer_->encode(prompt); return ids; } -VARP Bge::gen_attention_mask(int seq_len) { +VARP Embedding::gen_attention_mask(int seq_len) { auto attention_mask = _Input({1, 1, 1, seq_len}, NCHW, halide_type_of()); auto ptr = attention_mask->writeMap(); for (int i = 0; i < seq_len; i++) { @@ -968,7 +680,7 @@ VARP Bge::gen_attention_mask(int seq_len) { return attention_mask; } -VARP Bge::gen_position_ids(int seq_len) { +VARP Embedding::gen_position_ids(int seq_len) { auto position_ids = _Input({1, seq_len}, NCHW, halide_type_of()); auto ptr = position_ids->writeMap(); for (int i = 0; i < seq_len; i++) { diff --git a/transformers/llm/engine/src/tokenizer.cpp b/transformers/llm/engine/src/tokenizer.cpp index fb71443f0..d6f6490c8 100644 --- a/transformers/llm/engine/src/tokenizer.cpp +++ b/transformers/llm/engine/src/tokenizer.cpp @@ -78,18 +78,131 @@ static inline void to_lower_case(std::string& str) { } } -bool Sentencepiece::load(const std::string& filename) { +Tokenizer* Tokenizer::createTokenizer(const std::string& filename) { + Tokenizer* tokenizer = nullptr; + // check file std::ifstream tok_file(filename); + if (!tok_file.good()) { + printf("Failed: can't load tokenzier from: %s.\n", filename.c_str()); + return tokenizer; + } + // check tokenizer info + std::string line; + std::getline(tok_file, line); + std::istringstream line_str(line); + int magic_number, tokenizer_type; + line_str >> magic_number; + if (magic_number != MAGIC_NUMBER) { + printf("Failed: magic number is wrong from: %s.\n", filename.c_str()); + return tokenizer; + } + line_str >> tokenizer_type; + printf("tokenizer_type = %d\n", tokenizer_type); + // create tokenizer + switch (tokenizer_type) + { + case SENTENCEPIECE: + tokenizer = new Sentencepiece(); + break; + case TIKTOIKEN: + tokenizer = new Tiktoken(); + break; + case BERT: + tokenizer = new BertTokenizer(); + break; + case HUGGINGFACE: + tokenizer = new HuggingfaceTokenizer(); + break; + default: + return tokenizer; + } + // load special tokens + tokenizer->load_special(tok_file); + // load vocabs + tokenizer->load_vocab(tok_file); + tok_file.close(); + return tokenizer; +} + +bool Tokenizer::is_stop(int token) { + return std::find(stop_tokens_.begin(), stop_tokens_.end(), token) != stop_tokens_.end(); +} + +void Tokenizer::load_special(std::ifstream& tok_file) { + std::string line; + std::getline(tok_file, line); + std::istringstream line_str(line); + int special_num, stop_num, prefix_num; + line_str >> special_num >> stop_num >> prefix_num; + std::getline(tok_file, line); + std::istringstream specail_line(line); + if (special_num) { + // load special tokens + special_tokens_.resize(special_num); + for (int i = 0; i < special_num; i++) { + specail_line >> special_tokens_[i]; + } + } + if (stop_num) { + // load stop tokens + stop_tokens_.resize(stop_num); + for (int i = 0; i < stop_num; i++) { + specail_line >> stop_tokens_[i]; + } + } + if (prefix_num) { + // load prefix tokens + prefix_tokens_.resize(prefix_num); + for (int i = 0; i < prefix_num; i++) { + specail_line >> prefix_tokens_[i]; + } + } +} + +std::vector Tokenizer::encode(const std::string& str) { + std::vector ids = prefix_tokens_; + if (!special_tokens_.empty()) { + std::string text = str; + size_t start = 0; + for (size_t i = 0; i < text.length(); ++i) { + for (auto special_id : special_tokens_) { + const auto& token = decode(special_id); + if (token.empty()) 
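[Editor's aside, not part of the patch] createTokenizer() and load_special() above fix the new tokenizer.txt layout: a first line with the magic number (430, per the export script) and the tokenizer type, a second line with the special/stop/prefix counts, a third line listing those ids in that order, and then the type-specific vocab. A sketch of a reader for just that header; the struct and field names are mine:

    #include <fstream>
    #include <sstream>
    #include <string>
    #include <vector>

    // Header layout used by Tokenizer::createTokenizer()/load_special():
    //   line 1: <magic=430> <type>   (0 sentencepiece, 1 tiktoken, 2 bert, 3 huggingface)
    //   line 2: <special_num> <stop_num> <prefix_num>
    //   line 3: special ids, stop ids, prefix ids, space separated, in that order
    struct TokenizerHeader {
        int type = -1;
        std::vector<int> special, stop, prefix;
    };

    bool readTokenizerHeader(std::ifstream& file, TokenizerHeader& out) {
        std::string line;
        std::getline(file, line);
        std::istringstream first(line);
        int magic = 0;
        first >> magic >> out.type;
        if (magic != 430) return false;
        std::getline(file, line);
        std::istringstream counts(line);
        int specialNum = 0, stopNum = 0, prefixNum = 0;
        counts >> specialNum >> stopNum >> prefixNum;
        std::getline(file, line);
        std::istringstream ids(line);
        out.special.resize(specialNum); out.stop.resize(stopNum); out.prefix.resize(prefixNum);
        for (int& v : out.special) ids >> v;
        for (int& v : out.stop)    ids >> v;
        for (int& v : out.prefix)  ids >> v;
        return true;
    }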
continue; + if (i + token.length() <= text.length() && text.substr(i, token.length()) == token) { + if (i > start) { + encode(text.substr(start, i - start), ids); + } + ids.push_back(special_id); + start = i + token.length(); + i = start - 1; + break; + } + } + } + if (start < text.length()) { + encode(text.substr(start), ids); + } + } else { + encode(str, ids); + } + return ids; +} + +bool Sentencepiece::load_vocab(std::ifstream& tok_file) { std::string line, token; + std::getline(tok_file, line); + int vocab_len = std::stoi(line); float score; - int index = 0, type; - while (std::getline(tok_file, line)) { + int type; + sentence_pieces_.resize(vocab_len); + for (int index = 0; index < vocab_len; index++) { + std::getline(tok_file, line); std::istringstream line_str(line); line_str >> token >> score >> type; token = base64_decode(token); auto piece_type = static_cast(type); SentencePiece piece {token, score, piece_type}; - sentence_pieces_.emplace_back(std::move(piece)); + sentence_pieces_[index] = std::move(piece); if (piece_type == PieceType::NORMAL) { pieces_.insert({token, index}); } else { @@ -98,9 +211,7 @@ bool Sentencepiece::load(const std::string& filename) { unk_id_ = index; } } - index++; } - tok_file.close(); return true; } @@ -270,8 +381,7 @@ Sentencepiece::EncodeResult Sentencepiece::bpe_encode(std::string_view normalize return output; } -std::vector Sentencepiece::encode(const std::string& str) { - std::vector ids; +void Sentencepiece::encode(const std::string& str, std::vector& ids) { auto result = bpe_encode(str); size_t consumed = 0; for (const auto &p : result) { @@ -291,7 +401,6 @@ std::vector Sentencepiece::encode(const std::string& str) { ids.push_back(id); } } - return ids; } std::string Sentencepiece::decode(int id) { @@ -315,26 +424,24 @@ bool Sentencepiece::is_control(int id) const { return sentence_pieces_[id].type == PieceType::CONTROL; } -bool Tiktoken::load(const std::string& filename) { - std::ifstream tok_file(filename); - if (!tok_file.good()) { - printf("Failed: can't load tokenzier from: %s.\n", filename.c_str()); - return false; - } - std::string token; - while (tok_file >> token) { - token = base64_decode(token); - encoder_[token] = static_cast(decoder_.size()); - decoder_.push_back(token); +bool Tiktoken::load_vocab(std::ifstream& tok_file) { + std::string line; + std::getline(tok_file, line); + int vocab_len = std::stoi(line); + // load vocab + decoder_.resize(vocab_len); + for (int i = 0; i < vocab_len; i++) { + std::getline(tok_file, line); + auto token = base64_decode(line); + encoder_.insert({token, i}); + decoder_[i] = token; } - tok_file.close(); return true; } -std::vector Tiktoken::encode(const std::string& str) { - std::vector ids; +void Tiktoken::encode(const std::string& str, std::vector& ids) { if (str.empty()) { - return ids; + return; } size_t i = 0; while (i < str.size()) { @@ -362,10 +469,9 @@ std::vector Tiktoken::encode(const std::string& str) { // If no matching symbol is found, this typically means an error in the encoding // or the input text contains characters that the encoder doesn't know how to handle std::cerr << "Error: No encoding found for the sequence starting at position " << i << std::endl; - return {}; + return; } } - return ids; } std::string Tiktoken::decode(int id) { @@ -409,8 +515,7 @@ std::vector BertTokenizer::word_piece(const std::string& token) { return ids; } -std::vector BertTokenizer::encode(const std::string& str) { - std::vector ids; +void BertTokenizer::encode(const std::string& str, std::vector& ids) { 
std::vector tokens; std::string current_token; size_t i = 0; @@ -460,7 +565,6 @@ std::vector BertTokenizer::encode(const std::string& str) { ids.push_back(id); } } - return ids; } std::wstring utf8_to_wstring(const std::string& str) { @@ -484,8 +588,7 @@ void byte_encode_token(const std::string& token, } } -bool HuggingfaceTokenizer::load(const std::string& filename) { - std::ifstream tok_file(filename); +bool HuggingfaceTokenizer::load_vocab(std::ifstream& tok_file) { std::string line, token; // get nums int vocab_len, merge_len; @@ -506,7 +609,6 @@ bool HuggingfaceTokenizer::load(const std::string& filename) { bpe_ranks_.insert({{utf8_to_wstring(line.substr(0, d)), utf8_to_wstring(line.substr(d + 1))}, i}); } - tok_file.close(); // bytes_to_unicode auto _insert_range = [=](int start, int end) { for (int c = start; c <= end; c++) { @@ -601,8 +703,8 @@ void HuggingfaceTokenizer::bpe(const std::wstring& token, const BPERanks& bpe_ra } } -std::vector HuggingfaceTokenizer::encode(const std::string& str) { - std::regex re("('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?(_+)| ?[[:digit:]]+| ?[^\\s\\w]+|\\s+)"); +void HuggingfaceTokenizer::encode(const std::string& str, std::vector& ids) { + std::regex re("('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\\s\\w]+|\\s+)"); std::string input = str; std::vector result; std::string token; @@ -622,21 +724,22 @@ std::vector HuggingfaceTokenizer::encode(const std::string& str) { result.push_back(wstring_to_utf8(ws)); } } - std::vector ids; for (auto s : result) { ids.push_back(encoder_.at(s)); } - return ids; } std::string HuggingfaceTokenizer::decode(int id) { + // printf("decode id = %d, %lu, %s#\n", id, decoder_.size(), decoder_.at(id).c_str()); if (id >= decoder_.size()) { return ""; } std::wstring w = utf8_to_wstring(decoder_.at(id)); std::string r; for (wchar_t c : w) { - r.push_back(char(u2b_.at(c))); + if (u2b_.find(c) != u2b_.end()) { + r.push_back(char(u2b_.at(c))); + } } return r; } diff --git a/transformers/llm/export/llm_export.py b/transformers/llm/export/llm_export.py index 08a644455..4b541b247 100644 --- a/transformers/llm/export/llm_export.py +++ b/transformers/llm/export/llm_export.py @@ -1,6 +1,7 @@ import os import base64 import glob +import json import shutil import argparse import torch @@ -66,8 +67,8 @@ def __init__(self, lm): def forward(self, hidden_states): m_logits = self.lm(hidden_states) - token = torch.argmax(m_logits) - return token + # token = torch.argmax(m_logits) + return m_logits class LLM(torch.nn.Module): ''' @@ -90,16 +91,19 @@ def __init__(self, args): # default is False, just set True when using below command: # `python llm_export ../path --export --embed_bin` to export single model without embedding self.without_embed = False - self.embed_bin = args.embed_bin - if self.embed_bin: - self.embed_bf16 = True - else: - self.embed_bf16 = args.embed_bf16 + self.embed_bin = True + self.embed_bf16 = args.embed_bf16 self.skip_slim = args.skip_slim tokenizer_model = os.path.join(args.path, 'tokenizer.model') - if os.path.exists(tokenizer_model): - self.sp_model = spm.SentencePieceProcessor(tokenizer_model) - else: + ice_text_model = os.path.join(args.path, 'ice_text.model') + try: + if os.path.exists(tokenizer_model): + self.sp_model = spm.SentencePieceProcessor(tokenizer_model) + elif os.path.exists(ice_text_model): + self.sp_model = spm.SentencePieceProcessor(ice_text_model) + else: + self.sp_model = None + except: self.sp_model = None merge_file = os.path.join(args.path, 'merges.txt') if 
os.path.exists(merge_file): @@ -113,10 +117,21 @@ def __init__(self, args): self.lora_path = args.lora_path self.load_hf(args.path) self.load_model() + self.llm_config = { + 'hidden_size' : self.hidden_size, + 'layer_nums' : self.block_nums, + 'attention_mask': self.attention_mask_type, + 'key_value_shape': self.past_kv_shape[1:], + "prompt_template": self.build_prompt('%s'), + 'is_visual': False + } def load_hf(self, model_path: str): self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - self.model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).float().eval() + try: + self.model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).float().eval() + except: + self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True).float().eval() self.config = self.model.config if self.lora_path is not None: adapter = PeftModel.from_pretrained(self.model, model_id=self.lora_path) @@ -149,11 +164,11 @@ def __decode(self, hidden_states, attention_mask, position_ids, past_key_values) for i in range(self.block_nums): hidden_states, kv = self.blocks[i](hidden_states, attention_mask, position_ids, past_key_values[i]) presents.append(kv) - token_id = self.lm(hidden_states).view(1) + logits = self.lm(hidden_states).reshape(-1) presents = torch.stack(presents) self.seq_len += 1 self.token_len += 1 - return token_id, presents + return logits, presents def forward(self, input_ids, attention_mask, position_ids, past_key_values): if self.without_embed: @@ -188,8 +203,9 @@ def response(self, query): while self.token_len < self.max_length: attention_mask = self.get_attention_mask() position_ids = self.get_position_ids() - token_id, past_key_values = self.forward(token_id, attention_mask, position_ids, past_key_values) - if token_id == self.stop_id or token_id in self.stop_ids: + logits, past_key_values = self.forward(token_id, attention_mask, position_ids, past_key_values) + token_id = torch.argmax(logits) + if token_id in self.stop_ids: print("", end='\n') break word = self.id_to_str(token_id) @@ -218,7 +234,7 @@ def export_lm(self): onnx_model, verbose=self.export_verbose, input_names=['hidden_states'], - output_names=['token_id'], + output_names=['logits'], do_constant_folding=True, opset_version=15) if not self.skip_slim: @@ -272,7 +288,7 @@ def export_embed(self): tensor_data = model.embed.weight.data data_ptr = tensor_data.untyped_storage().data_ptr() buffer = (ctypes.c_byte * (tensor_data.numel() * 2)).from_address(data_ptr) - with open(f'./{self.mnn_path}/embeddings_bf16.bin', 'wb') as f: + with open(f'./{self.onnx_path}/embeddings_bf16.bin', 'wb') as f: f.write(buffer) return input_ids = torch.arange(3, dtype=torch.long) @@ -341,6 +357,11 @@ def export_blocks(self): for i in range(self.block_nums): self.export_block(i) + def export_config(self, is_single = True): + self.llm_config['is_single'] = is_single + with open(f'./{self.onnx_path}/llm_config.json', 'w', encoding='utf-8') as f: + json.dump(self.llm_config, f, ensure_ascii=False, indent=4) + def export(self): model = self self.seq_len = 3 @@ -361,13 +382,23 @@ def export(self): input_names=[ 'input_ids', 'attention_mask', 'position_ids', 'past_key_values' ], - output_names=['token_id', 'presents'], + output_names=['logits', 'presents'], dynamic_axes=self.model_dynamic_axes, do_constant_folding=True, opset_version=15) print('export done!') if not self.skip_slim: slim(onnx_model, output_model=onnx_model) + for file_path in 
glob.glob(f'./{self.onnx_path}/onnx__*'): + try: + os.remove(file_path) + except FileNotFoundError: + pass + for file_path in glob.glob(f'./{self.onnx_path}/model.*'): + try: + os.remove(file_path) + except FileNotFoundError: + pass if self.export_test: # test original_outs = model(input_ids, attention_mask, position_ids, past_key_values) @@ -387,13 +418,36 @@ def export(self): self.without_embed = False def export_tokenizer(self): + # TOKENIZER MAGIC NUMBER + MAGIC_NUMBER = 430 + # TOKENIZER TYPE + SENTENCEPIECE = 0; TIKTOIKEN = 1; BERT = 2; HUGGINGFACE = 3 + def write_line(fp, *args): + for arg in args: + for token in arg: + fp.write(str(token) + ' ') + fp.write('\n') + def write_header(fp, type, speicals, prefix = []): + fp.write(f'{MAGIC_NUMBER} {type}\n') + fp.write(f'{len(speicals)} {len(self.stop_ids)} {len(prefix)}\n') + write_line(fp, speicals, self.stop_ids, prefix) + file_path = os.path.join(self.onnx_path, "tokenizer.txt") + special_list = list(self.tokenizer.added_tokens_decoder.keys()) + if hasattr(self.tokenizer, 'special_tokens'): + for k, v in self.tokenizer.special_tokens.items(): + special_list.append(v) + if hasattr(self.tokenizer, 'gmask_token_id'): + special_list.append(self.tokenizer.gmask_token_id) + vocab_list = [] + prefix_list = [] + if hasattr(self.tokenizer, 'get_prefix_tokens'): + prefix_list = self.tokenizer.get_prefix_tokens() if self.sp_model is not None: # senetencepiece print('# senetencepiece tokenier') NORMAL = 1; UNKNOWN = 2; CONTROL = 3 USER_DEFINED = 4; UNUSED = 5; BYTE = 6 - fp = open(file_path, "w", encoding="utf8") for i in range(self.sp_model.GetPieceSize()): token = self.sp_model.IdToPiece(i) score = self.sp_model.GetScore(i) @@ -412,23 +466,37 @@ def export_tokenizer(self): if '<|blank_' in token: token = ' ' * int(token[8:token.find('|>')]) if '▁' in token: token = token.replace('▁', ' ') token_encode = base64.b64encode(token.encode("utf-8")).decode("utf8") - fp.write(f'{token_encode} {score} {type}\n') - fp.close() + vocab_list.append(f'{token_encode} {score} {type}\n') + with open(file_path, "w", encoding="utf8") as fp: + write_header(fp, SENTENCEPIECE, special_list, prefix_list) + fp.write(f'{len(vocab_list)}\n') + for vocab in vocab_list: + fp.write(vocab) elif hasattr(self.tokenizer, 'mergeable_ranks'): print('# tiktoken tokenier') # tikton + vocab_list = [] + for k, v in self.tokenizer.mergeable_ranks.items(): + line = base64.b64encode(k).decode("utf8") + "\n" + vocab_list.append(line) + if hasattr(self.tokenizer, 'special_tokens'): + for k, v in self.tokenizer.special_tokens.items(): + line = base64.b64encode(k.encode("utf-8")).decode("utf8") + "\n" + vocab_list.append(line) + if hasattr(self.tokenizer, 'added_tokens_decoder'): + for k, v in self.tokenizer.added_tokens_decoder.items(): + line = base64.b64encode(v.__str__().encode("utf-8")).decode("utf8") + "\n" + vocab_list.append(line) with open(file_path, "w", encoding="utf8") as fp: - for k, v in self.tokenizer.mergeable_ranks.items(): - line = base64.b64encode(k).decode("utf8") + "\n" - fp.write(line) - if hasattr(self.tokenizer, 'special_tokens'): - for k, v in self.tokenizer.special_tokens.items(): - line = base64.b64encode(k.encode("utf-8")).decode("utf8") + "\n" - fp.write(line) + write_header(fp, TIKTOIKEN, special_list, prefix_list) + fp.write(f'{len(vocab_list)}\n') + for vocab in vocab_list: + fp.write(vocab) elif self.merge_txt is not None: # huggingface tokenizer merge_list = [] vocab = self.tokenizer.get_vocab() + special_list = 
list(self.tokenizer.added_tokens_decoder.keys()) vocab_list = ['' for i in range(len(vocab))] # load vocab for k, v in vocab.items(): @@ -439,13 +507,15 @@ def export_tokenizer(self): merge_list.append(line) # write to tokenizer.txt with open(file_path, "w", encoding="utf8") as fp: + write_header(fp, HUGGINGFACE, special_list) fp.write(f'{len(vocab_list)} {len(merge_list)}\n') for v in vocab_list: fp.write(v + '\n') for m in merge_list: fp.write(m) else: - # huggingface tokenizer + print('# other tiktoken tokenier') + # other tikton def unicode_to_byte(u: int): if u >= 256 and u <= 288: return u - 256 @@ -458,25 +528,28 @@ def unicode_to_byte(u: int): if u == 9601: # _ return 95 return u + vocab = self.tokenizer.get_vocab() + vocab_list = ['' for i in range(len(vocab))] + for k, v in vocab.items(): + try: + vocab_list[int(v)] = bytes([unicode_to_byte(ord(c)) for c in k]).decode('utf-8', errors='ignore') + except: + vocab_list[int(v)] = k + special_list = list(self.tokenizer.added_tokens_decoder.keys()) with open(file_path, "w", encoding="utf8") as fp: - vocab = self.tokenizer.get_vocab() - vocab_list = ['' for i in range(len(vocab))] - for k, v in vocab.items(): - try: - vocab_list[int(v)] = bytes([unicode_to_byte(ord(c)) for c in k]).decode('utf-8', errors='ignore') - except: - vocab_list[int(v)] = k + write_header(fp, TIKTOIKEN, special_list) + fp.write(f'{len(vocab_list)}\n') for v in vocab_list: line = base64.b64encode(v.encode('utf-8')).decode("utf8") + "\n" fp.write(line) - # chatglm class GLMBlock(torch.nn.Module): def __init__(self, block, block_id, final_layernorm = None): super().__init__() self.block = block self.block_id = block_id + self.hidden_size = 4096 self.final_layernorm = final_layernorm def forward(self, hidden_states, attention_mask, position_ids, past_kv): @@ -495,8 +568,9 @@ def forward(self, hidden_states, attention_mask, position_ids, past_kv): class Chatglm_6b(LLM): def __init__(self, args): - super().__init__(args) + self.attention_mask_type = 'glm' self.model_name = 'Chatglm_6b' + super().__init__(args) def load_model(self): transformer = self.model.transformer @@ -505,7 +579,7 @@ def load_model(self): self.blocks_ = transformer.layers self.final_layernorm_ = transformer.final_layernorm # some wrapper - self.stop_id = self.tokenizer._convert_token_to_id(self.tokenizer.eos_token) + self.stop_ids.append(self.tokenizer._convert_token_to_id(self.tokenizer.eos_token)) self.block_nums = len(self.blocks_) self.lm = Lm(self.lm_) # chatglm embedding and lm using same param, copy embedding when using bf16 @@ -535,31 +609,38 @@ def get_attention_mask(self) -> torch.Tensor: if self.token_len: return torch.zeros([1]).bool().reshape([1, 1, 1, 1]) attention_mask = torch.zeros([self.seq_len, self.seq_len], dtype=torch.bool) - for i in range(self.seq_len): + for i in range(self.seq_len - 1): attention_mask[i][-1] = True attention_mask = attention_mask.reshape([1, 1, self.seq_len, self.seq_len]) return attention_mask def get_position_ids(self) -> torch.Tensor: if self.token_len: - return torch.tensor([1, self.seq_len - self.context_len]).reshape([1, 2, 1]) + return torch.tensor([self.context_len, self.token_len + 1]).reshape([1, 2, 1]) position_ids_0 = torch.arange(self.seq_len, dtype=torch.long) position_ids_1 = torch.zeros(self.seq_len, dtype=torch.long) + position_ids_0[-1] = position_ids_0[-2] position_ids_1[-1] = 1 position_ids = torch.stack([position_ids_0, position_ids_1]).view(1, 2, -1) return position_ids + def build_prompt(self, query): + return f'{query}[gMASK]' + # 
chatglm2 class GLM2Block(torch.nn.Module): - def __init__(self, block, block_id, final_layernorm = None): + def __init__(self, block, block_id, config, final_layernorm = None): super().__init__() self.block = block self.block_id = block_id self.final_layernorm = final_layernorm + self.config = config self.hidden_size = 4096 def forward(self, hidden_states, attention_mask, position_ids, past_kv): - theta = 1.0 / (10000 ** (torch.arange(0, 64, 2, dtype=torch.float32) / 64)) + rope_ratio = self.config.rope_ratio + base = 10000 * rope_ratio + theta = 1.0 / (base ** (torch.arange(0, 64, 2, dtype=torch.float32) / 64)) position_ids = position_ids.float().reshape(-1, 1) idx_theta = position_ids * theta rotary_pos_emb = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1).unsqueeze(0).contiguous() @@ -576,6 +657,7 @@ def forward(self, hidden_states, attention_mask, position_ids, past_kv): class Chatglm2_6b(LLM): def __init__(self, args): + self.attention_mask_type = 'glm2' super().__init__(args) self.model_name = 'Chatglm2_6b' if 'codegeex2-6b' in args.path: @@ -588,14 +670,21 @@ def load_model(self): self.blocks_ = transformer.encoder.layers self.final_layernorm_ = transformer.encoder.final_layernorm # some wrapper - self.stop_id = self.tokenizer.eos_token_id - if self.stop_id is None: + if self.tokenizer.eos_token_id is None: # codegeex2-6b - self.stop_id = self.tokenizer.tokenizer.eos_id + self.stop_ids.append(self.tokenizer.tokenizer.eos_id) + else: + self.stop_ids.append(self.tokenizer.eos_token_id) + if hasattr(self.config, 'eos_token_id'): + if type(self.config.eos_token_id) is list: + for eos_id in self.config.eos_token_id: + self.stop_ids.append(eos_id) + elif type(self.config.eos_token_id) is int: + self.stop_ids.append(self.config.eos_token_id) self.block_nums = len(self.blocks_) self.embed = Embedding(self.embed_, self.embed_bf16) self.lm = Lm(self.lm_) - self.blocks = [GLM2Block(self.blocks_[i], i, self.final_layernorm_ if i == len(self.blocks_) - 1 else None) for i in range(self.block_nums)] + self.blocks = [GLM2Block(self.blocks_[i], i, self.config, self.final_layernorm_ if i == len(self.blocks_) - 1 else None) for i in range(self.block_nums)] # some config for export self.past_kv_shape = [28, 2, 0, 1, 2, 128] self.block_dynamic_axes = { @@ -610,6 +699,21 @@ def load_model(self): "position_ids" : { 0: "seq_len" }, "past_key_values" : { 2: "history_len" } } + num_layers = self.config.num_layers + if num_layers > 28: + self.past_kv_shape = [num_layers, 2, 1, 2, 0, 128] + self.block_dynamic_axes = { + "inputs_embeds" : { 0: "seq_len" }, + "attention_mask" : { 2: "seq_len", 3: "seq_len" }, + "position_ids" : { 0: "seq_len" }, + "past_key_values" : { 3: "history_len" } + } + self.model_dynamic_axes = { + "input_ids" : { 0: "seq_len" }, + "attention_mask" : { 2: "seq_len", 3: "seq_len" }, + "position_ids" : { 0: "seq_len" }, + "past_key_values" : { 4: "history_len" } + } def get_attention_mask(self) -> torch.Tensor: if self.token_len: @@ -691,7 +795,17 @@ def forward(self, hidden_states, attention_mask, position_ids, past_kv): class Qwen_Chat(LLM): def __init__(self, args): + self.attention_mask_type = 'int' super().__init__(args) + if 'VL' in self.model_name: + self.llm_config['is_visual'] = True + self.llm_config['attention_mask'] = 'float' + self.llm_config['img_size'] = 448 + self.llm_config['imgpad_len'] = 256 + self.llm_config['img_start'] = self.tokenizer.img_start_id + self.llm_config['img_end'] = self.tokenizer.img_end_id + self.llm_config['img_pad'] = 
self.tokenizer.img_pad_id + def load_model(self): # Qwen models @@ -710,7 +824,7 @@ def load_model(self): self.image_start_id = transformer.config.visual['image_start_id'] self.image_size = transformer.config.visual['image_size'] # some wrapper - self.stop_id = self.tokenizer.im_end_id + self.stop_ids.append(self.tokenizer.im_end_id) self.block_nums = len(self.blocks_) self.hidden_size = transformer.embed_dim self.embed = Embedding(self.embed_, self.embed_bf16) @@ -742,7 +856,7 @@ def build_prompt(self, query): def get_attention_mask(self) -> torch.Tensor: if self.model_name == 'Qwen-VL': if self.token_len: - return torch.zeros([1, 1, 1, self.seq_len], dtype=torch.float32) + return torch.zeros([1, 1, 1, 1], dtype=torch.float32) return (1 - torch.tril(torch.ones([1, 1, self.seq_len, self.seq_len]))) * torch.finfo(torch.float32).min if self.token_len: return torch.ones([1, 1, 1, 1]).bool() @@ -794,6 +908,7 @@ def forward(self, hidden_states, attention_mask, position_ids, past_kv): past_key_value=past_kv, rotary_pos_emb=rotary_pos_emb, use_cache=True) + if self.final_layernorm is not None: hidden_states = self.final_layernorm(hidden_states) hidden_states = hidden_states.view(-1, self.hidden_size)[-1].view(1, 1, self.hidden_size) @@ -804,6 +919,7 @@ def forward(self, hidden_states, attention_mask, position_ids, past_kv): class Qwen2_Chat(LLM): def __init__(self, args): + self.attention_mask_type = 'float' super().__init__(args) def load_model(self): @@ -815,14 +931,14 @@ def load_model(self): self.blocks_ = transformer.layers self.final_layernorm_ = transformer.norm # some wrapper - self.stop_id = self.tokenizer.eos_token_id + self.stop_ids.append(self.tokenizer.eos_token_id) if hasattr(self.model, 'generation_config'): - self.stop_ids.append(self.stop_id) for id in self.model.generation_config.eos_token_id: self.stop_ids.append(id) self.block_nums = self.config.num_hidden_layers self.hidden_size = self.config.hidden_size self.num_heads = self.config.num_attention_heads + self.kv_heads = self.config.num_key_value_heads self.rope_theta = self.config.rope_theta self.head_dim = self.hidden_size // self.num_heads if self.embed_.weight is self.lm_.weight: @@ -832,7 +948,7 @@ def load_model(self): else: self.embed = Embedding(self.embed_, self.embed_bf16) self.lm = Lm(self.lm_) - self.past_kv_shape = [self.block_nums, 2, 1, 0, self.num_heads, self.head_dim] + self.past_kv_shape = [self.block_nums, 2, 1, 0, self.kv_heads, self.head_dim] self.blocks = [QWEN2Block(self.model_name, self.blocks_[i], i, self.config, self.final_layernorm_ if i == len(self.blocks_) - 1 else None) for i in range(self.block_nums)] # some config for export self.block_dynamic_axes = { @@ -881,19 +997,28 @@ def visual_embed(self, input_ids): # llama2 class LLAMA2Block(torch.nn.Module): - def __init__(self, block, block_id, hidden_size, final_layernorm = None): + def __init__(self, block, block_id, hidden_size, head_dim, final_layernorm = None): super().__init__() self.block = block self.block_id = block_id + self.head_dim = head_dim self.final_layernorm = final_layernorm self.hidden_size = hidden_size def forward(self, hidden_states, attention_mask, position_ids, past_kv): + theta = 1.0 / (10000.0 ** (torch.arange(0, self.head_dim, 2, dtype=torch.float32) / self.head_dim)) + position_ids = position_ids.float().reshape(-1, 1) + idx_theta = position_ids * theta + rotary_pos_emb = torch.cat((idx_theta, idx_theta), dim=-1) + rotary_pos_emb = rotary_pos_emb.unsqueeze(1).unsqueeze(0) + rotary_pos_emb = 
torch.stack([torch.cos(rotary_pos_emb), torch.sin(rotary_pos_emb)]) hidden_states = hidden_states.view(1, -1, self.hidden_size) + position_ids = position_ids.view(1, -1) hidden_states, presents = self.block(hidden_states, attention_mask, position_ids, past_kv, + rotary_pos_emb=rotary_pos_emb, use_cache=True) if self.final_layernorm is not None: hidden_states = self.final_layernorm(hidden_states) @@ -904,6 +1029,7 @@ def forward(self, hidden_states, attention_mask, position_ids, past_kv): class Llama2_7b_Chat(LLM): def __init__(self, args): + self.attention_mask_type = 'float' self.model_name = 'Llama2_7b' if 'Baichuan2' in args.path: self.model_name = 'Baichuan2_7B' @@ -928,33 +1054,35 @@ def load_model(self): self.final_layernorm_ = transformer.norm # some wrapper self.hidden_size = self.embed_.weight.shape[-1] - self.stop_id = self.tokenizer.eos_token_id + self.stop_ids.append(self.tokenizer.eos_token_id) if hasattr(self.model, 'generation_config'): - self.stop_ids.append(self.stop_id) self.stop_ids.append(self.model.generation_config.eos_token_id) if self.model_name == 'Llama3_8B': self.stop_ids.append(self.tokenizer.convert_tokens_to_ids("<|eot_id|>")) self.block_nums = len(self.blocks_) self.embed = Embedding(self.embed_, self.embed_bf16) self.lm = Lm(self.lm_) - self.blocks = [LLAMA2Block(self.blocks_[i], i, self.hidden_size, self.final_layernorm_ if i == len(self.blocks_) - 1 else None) for i in range(self.block_nums)] self.block_nums = self.config.num_hidden_layers self.hidden_size = self.config.hidden_size self.num_attention_heads = self.config.num_attention_heads self.head_dim = self.hidden_size // self.num_attention_heads - self.num_key_value_heads = self.config.num_key_value_heads - self.past_kv_shape = [self.block_nums, 2, 1, self.num_key_value_heads, 0, self.head_dim] + if hasattr(self.config, 'num_key_value_heads'): + self.num_key_value_heads = self.config.num_key_value_heads + else: + self.num_key_value_heads = self.config.num_attention_heads + self.blocks = [LLAMA2Block(self.blocks_[i], i, self.hidden_size, self.head_dim, self.final_layernorm_ if i == len(self.blocks_) - 1 else None) for i in range(self.block_nums)] + self.past_kv_shape = [self.block_nums, 2, 1, 0, self.num_key_value_heads, self.head_dim] self.block_dynamic_axes = { "inputs_embeds" : { 0: "seq_len" }, "attention_mask" : { 2: "seq_len", 3: "seq_len" }, "position_ids" : { 1: "seq_len" }, - "past_key_values" : { 3: "history_len" } + "past_key_values" : { 2: "history_len" } } self.model_dynamic_axes = { "input_ids" : { 0: "seq_len" }, "attention_mask" : { 2: "seq_len", 3: "seq_len" }, "position_ids" : { 1: "seq_len" }, - "past_key_values" : { 4: "history_len" } + "past_key_values" : { 3: "history_len" } } def build_prompt(self, query): @@ -967,10 +1095,10 @@ def build_prompt(self, query): if 'Yi' in self.model_name: return f'<|im_start|> user\n{query}<|im_end|>\n<|im_start|> assistant\n' if 'deepseek' in self.model_name: - return f'<|begin▁of▁sentence|>User: {query}\nAssistant:' + return f'<|begin_of_sentence|>User: {query}\n\nAssistant:' if 'Llama3' in self.model_name: return f'<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{query}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' - return f'[INST]{query}[/INST]' + return f'[INST]{query}[/INST]' def get_attention_mask(self) -> torch.Tensor: if self.token_len: @@ -1007,6 +1135,7 @@ def forward(self, hidden_states, attention_mask, position_ids, past_kv): class phi_2(LLM): def __init__(self, args): + self.attention_mask_type = 'glm' 
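The rotary table that LLAMA2Block.forward builds above, before handing control to the HuggingFace block, can be reproduced in isolation. The sketch below is illustrative only (the helper name and the example head_dim are not part of the patch) and assumes the same rotate_half layout used by the modified modeling files; precomputing cos/sin outside the attention op keeps the per-block ONNX graphs free of the upstream rotary-embedding cache, which appears to be what the llama2/llama3 attention-fuse fix relies on.

    import torch

    def build_rotary_pos_emb(position_ids: torch.Tensor, head_dim: int, base: float = 10000.0) -> torch.Tensor:
        # Hypothetical helper mirroring the rotary_pos_emb computed in LLAMA2Block.forward.
        theta = 1.0 / (base ** (torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim))
        idx_theta = position_ids.float().reshape(-1, 1) * theta      # [seq_len, head_dim // 2]
        angles = torch.cat((idx_theta, idx_theta), dim=-1)           # [seq_len, head_dim]
        angles = angles.unsqueeze(1).unsqueeze(0)                    # [1, seq_len, 1, head_dim]
        return torch.stack([torch.cos(angles), torch.sin(angles)])   # [2, 1, seq_len, 1, head_dim]

    # cos/sin broadcast directly against [batch, seq_len, num_heads, head_dim] states.
    cos, sin = build_rotary_pos_emb(torch.arange(8), head_dim=128)
    print(cos.shape)  # torch.Size([1, 8, 1, 128])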
super().__init__(args) self.model_name = 'phi-2' self.asymmetric = False # TODO: some precision bug when using asymmetric @@ -1019,7 +1148,7 @@ def load_model(self): self.blocks_ = transformer.h # self.final_layernorm_ = transformer.final_layernorm # some wrapper - self.stop_id = self.tokenizer.eos_token_id + self.stop_ids.append(self.tokenizer.eos_token_id) self.block_nums = len(self.blocks_) self.embed = Embedding(self.embed_, self.embed_bf16) self.lm = Lm(self.lm_) @@ -1067,6 +1196,8 @@ def forward(self, hidden_states, attention_mask): class bge(LLM): def __init__(self, args): + self.attention_mask_type = 'int' + self.past_kv_shape = [] super().__init__(args) self.model_name = 'bge-large-zh' @@ -1092,13 +1223,14 @@ def response(self, query): return res def load_model(self): + self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True).float().eval() transformer = self.model.encoder self.lm_ = self.model.pooler self.embed_ = self.model.embeddings self.hidden_size = self.embed_.word_embeddings.weight.shape[-1] self.blocks_ = transformer.layer # some wrapper - self.stop_id = self.tokenizer.eos_token_id + self.stop_ids = [] self.block_nums = len(self.blocks_) self.embed = self.embed_ self.lm = self.lm_ @@ -1157,6 +1289,9 @@ def export(self): if self.export_mnn: onnx2mnn(onnx_model, self.mnn_path, 8, True, bizCode=token_str) + def build_prompt(self, query): + return f'[CLS]{query}[SEP]' + def get_position_ids(self) -> torch.Tensor: return torch.arange(self.seq_len, dtype=torch.long).unsqueeze(0) @@ -1189,8 +1324,9 @@ def export(self): llm_models = { 'chatglm-6b': Chatglm_6b, 'chatglm2-6b': Chatglm2_6b, - 'chatglm3-6b': Chatglm3_6b, 'codegeex2-6b': Chatglm2_6b, + 'chatglm3-6b': Chatglm3_6b, + 'glm-4-9b-chat': Chatglm3_6b, 'Qwen-7B-Chat': Qwen_Chat, 'Qwen-1_8B-Chat': Qwen_Chat, 'Qwen-1_8B': Qwen_Chat, @@ -1199,6 +1335,9 @@ def export(self): 'Qwen1_5-1_8B-Chat': Qwen2_Chat, 'Qwen1_5-4B-Chat': Qwen2_Chat, 'Qwen1_5-7B-Chat': Qwen2_Chat, + 'Qwen2-0_5B-Instruct': Qwen2_Chat, + 'Qwen2-1_5B-Instruct': Qwen2_Chat, + 'Qwen2-7B-Instruct': Qwen2_Chat, 'Baichuan2-7B-Chat': Llama2_7b_Chat, 'Llama-2-7b-chat-ms': Llama2_7b_Chat, 'Llama-3-8B-Instruct': Llama2_7b_Chat, @@ -1235,16 +1374,17 @@ def export(self): '\n\t- block models.' '\n\t- lm_head model.' 
) - parser.add_argument('--export_token', action='store_true', help='export llm tokenizer to a txt file.') - parser.add_argument('--export_embed', action='store_true', help='export llm embedding to an `onnx` model.') parser.add_argument('--export_visual', action='store_true', help='export llm visual model to an `onnx` model.') parser.add_argument('--export_lm', action='store_true', help='export llm lm_head to an `onnx` model.') parser.add_argument('--export_block', type=int, help='export llm block [id] to an `onnx` model.') parser.add_argument('--export_blocks', action='store_true', help='export llm all blocks to `onnx` models.') - parser.add_argument('--embed_bin', action='store_true', help='export embedding weight as bin file with dtype `bfloat16`') - parser.add_argument('--embed_bf16', action='store_true', help='using `bfloat16` replace `float32` in embedding.') parser.add_argument('--skip_slim', action='store_true', help='Whether or not to skip onnx-slim.') + # No use now, add invoid of call error + parser.add_argument('--export_token', action='store_true', help='export llm tokenizer to a txt file.') + parser.add_argument('--export_embed', action='store_true', help='export llm embedding to an `onnx` model.') + parser.add_argument('--embed_bf16', default=True, action='store_true', help='using `bfloat16` replace `float32` in embedding.') + parser.add_argument('--embed_bin', action='store_true', help='export embedding weight as bin file with dtype `bfloat16`') args = parser.parse_args() model_path = args.path @@ -1267,14 +1407,15 @@ def export(self): if args.test is not None: llm_exporter.response(args.test) + if args.export or args.export_split: + llm_exporter.export_config(args.export) + if args.export: llm_exporter.export() - if args.export_token: - llm_exporter.export_tokenizer() + llm_exporter.export_tokenizer() - if args.export_embed or args.export_split: - llm_exporter.export_embed() + llm_exporter.export_embed() if args.export_visual or args.export_split: llm_exporter.export_visual() @@ -1286,4 +1427,4 @@ def export(self): llm_exporter.export_blocks() if args.export_block is not None: - llm_exporter.export_block(args.export_block) + llm_exporter.export_block(args.export_block) \ No newline at end of file diff --git a/transformers/llm/export/llm_models/Baichuan2-7B-Chat/modeling_baichuan.py b/transformers/llm/export/llm_models/Baichuan2-7B-Chat/modeling_baichuan.py index 9f2968cc4..5a0b69e83 100755 --- a/transformers/llm/export/llm_models/Baichuan2-7B-Chat/modeling_baichuan.py +++ b/transformers/llm/export/llm_models/Baichuan2-7B-Chat/modeling_baichuan.py @@ -128,7 +128,7 @@ def forward(self, x, seq_len=None): self.sin_cached = emb.sin()[None, None, :, :].to(torch.float32).to(x.device) elif self.cos_cached.device != x.device: self.cos_cached = self.cos_cached.to(x.device) - self.sin_cached = self.sin_cached.to(x.device) + self.sin_cached = self.sin_cached.to(x.device) return ( self.cos_cached[:, :, :seq_len, ...], self.sin_cached[:, :, :seq_len, ...], @@ -149,8 +149,8 @@ def apply_rotary_pos_emb(q, k, cos_, sin_, position_ids): # cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] # sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] # print(f'### q.shape = {q.shape}, cos.shape = {cos.shape}') - cos = cos[position_ids] - sin = sin[position_ids] + # cos = cos[position_ids] + # sin = sin[position_ids] q_embed = (q.float() * cos) + (rotate_half(q.float()) * sin) k_embed = (k.float() * cos) + (rotate_half(k.float()) * sin) return q_embed.to(q.dtype), 
k_embed.to(k.dtype) @@ -205,6 +205,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, output_attentions: bool = False, use_cache: bool = False, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: @@ -212,6 +213,7 @@ def forward( proj = self.W_pack(hidden_states) proj = proj.reshape([1, -1, 3, 4096]).permute([2, 0, 1, 3]) + ''' # proj = proj.unflatten(-1, (3, self.hidden_size)).unsqueeze(0).transpose(0, -2).squeeze(-2) query_states = proj[0].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) key_states = proj[1].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) @@ -220,7 +222,10 @@ def forward( kv_seq_len = key_states.shape[-2] if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + if rotary_pos_emb is None: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + else: + cos, sin = rotary_pos_emb query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) # [bsz, nh, t, hd] @@ -239,13 +244,35 @@ def forward( query_states, key_states, value_states, attn_bias=xops.LowerTriangularMask() ) else: - ''' - with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True): - attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attn_mask = attention_mask) - ''' attn_output = self.raw_atten(query_states, key_states, value_states, attention_mask) attn_output = attn_output.transpose(1, 2) - + ''' + #--------------- + query_states = proj[0].view(bsz, q_len, self.num_heads, self.head_dim) + key_states = proj[1].view(bsz, q_len, self.num_heads, self.head_dim) + value_states = proj[2].view(bsz, q_len, self.num_heads, self.head_dim) + kv_seq_len = key_states.shape[1] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[1] + # rope + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + # kv cache + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=1) + value_states = torch.cat((past_value, value_states), dim=1) + past_key_value = torch.stack((key_states, value_states)) + query_states = query_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) + value_states = value_states.transpose(1, 2) + attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) + attn_weights = attn_weights + attention_mask + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + #--------------- attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) attn_output = self.o_proj(attn_output) @@ -274,6 +301,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: @@ -288,6 
+316,7 @@ def forward( attention_mask=attention_mask, position_ids=position_ids, past_key_value=past_key_value, + rotary_pos_emb=rotary_pos_emb, output_attentions=output_attentions, use_cache=use_cache, ) @@ -567,7 +596,7 @@ def set_decoder(self, decoder): def get_decoder(self): return self.model - + @classmethod def from_pretrained( cls, @@ -603,7 +632,7 @@ def from_pretrained( ) else: model_kwargs = kwargs - + if hasattr(config, "quantization_config") and config.quantization_config['load_in_4bit']: try: from .quantizer import init_model_weight_int4 @@ -611,20 +640,20 @@ def from_pretrained( from accelerate.utils import CustomDtype from accelerate.utils import get_balanced_memory except ImportError: - raise ImportError(f"Needs import model weight init func to run quantize.") + raise ImportError(f"Needs import model weight init func to run quantize.") # Instantiate model. init_contexts = [no_init_weights(_enable=True)] init_contexts.append(init_empty_weights()) with ContextManagers(init_contexts): model = cls(config) - + model_file = os.path.join(pretrained_model_name_or_path, 'pytorch_model.bin') - state_dict = torch.load(model_file, map_location="cpu") + state_dict = torch.load(model_file, map_location="cpu") model.is_quantized = True - + device_map = kwargs.pop("device_map", None) torch_dtype = kwargs.pop("torch_dtype", None) - + kwargs = {"no_split_module_classes": model._no_split_modules} target_dtype = CustomDtype.INT4 max_memory = get_balanced_memory( @@ -635,10 +664,10 @@ def from_pretrained( **kwargs, ) kwargs["max_memory"] = max_memory - + device_map = infer_auto_device_map(model, dtype=target_dtype, **kwargs) model = init_model_weight_int4(config, model, state_dict) - + # Set model in evaluation mode to deactivate DropOut modules by default model.eval() # If it is a model with generation capabilities, attempt to load the generation config @@ -663,15 +692,15 @@ def from_pretrained( "Generation config file not found, using a generation config created from the model config." 
) pass - + if device_map is not None: dispatch_model(model, device_map=device_map) - + return model - return super(BaichuanForCausalLM, cls).from_pretrained(pretrained_model_name_or_path, *model_args, - config=config, cache_dir=cache_dir, ignore_mismatched_sizes=ignore_mismatched_sizes, - force_download=force_download, local_files_only=local_files_only, token=token, revision=revision, - use_safetensors=use_safetensors, **kwargs) + return super(BaichuanForCausalLM, cls).from_pretrained(pretrained_model_name_or_path, *model_args, + config=config, cache_dir=cache_dir, ignore_mismatched_sizes=ignore_mismatched_sizes, + force_download=force_download, local_files_only=local_files_only, token=token, revision=revision, + use_safetensors=use_safetensors, **kwargs) def forward( self, diff --git a/transformers/llm/export/llm_models/Llama-2-7b-chat-ms/modeling_llama.py b/transformers/llm/export/llm_models/Llama-2-7b-chat-ms/modeling_llama.py index 8c562c604..493b040b7 100644 --- a/transformers/llm/export/llm_models/Llama-2-7b-chat-ms/modeling_llama.py +++ b/transformers/llm/export/llm_models/Llama-2-7b-chat-ms/modeling_llama.py @@ -182,8 +182,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids): # sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] cos = torch.squeeze(cos) # [seq_len, dim] sin = torch.squeeze(sin) # [seq_len, dim] - cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + # cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + # sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed @@ -282,6 +282,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, output_attentions: bool = False, use_cache: bool = False, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: @@ -306,7 +307,7 @@ def forward( query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - + ''' query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) @@ -314,7 +315,10 @@ def forward( kv_seq_len = key_states.shape[-2] if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + if rotary_pos_emb is None: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + else: + cos, sin = rotary_pos_emb query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: @@ -327,8 +331,32 @@ def forward( # repeat k/v heads if n_kv_heads < n_heads key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + ''' + #--------------- + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + value_states = value_states.view(bsz, q_len, 
self.num_key_value_heads, self.head_dim) + kv_seq_len = key_states.shape[1] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[1] + # rope + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + # kv cache + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=1) + value_states = torch.cat((past_value, value_states), dim=1) + past_key_value = torch.stack((key_states, value_states)) + query_states = query_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) + value_states = value_states.transpose(1, 2) + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + #--------------- + attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( @@ -384,6 +412,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: @@ -411,6 +440,7 @@ def forward( attention_mask=attention_mask, position_ids=position_ids, past_key_value=past_key_value, + rotary_pos_emb=rotary_pos_emb, output_attentions=output_attentions, use_cache=use_cache, ) diff --git a/transformers/llm/export/llm_models/Llama-3-8B-Instruct/modeling_llama.py b/transformers/llm/export/llm_models/Llama-3-8B-Instruct/modeling_llama.py index 8c562c604..493b040b7 100644 --- a/transformers/llm/export/llm_models/Llama-3-8B-Instruct/modeling_llama.py +++ b/transformers/llm/export/llm_models/Llama-3-8B-Instruct/modeling_llama.py @@ -182,8 +182,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids): # sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] cos = torch.squeeze(cos) # [seq_len, dim] sin = torch.squeeze(sin) # [seq_len, dim] - cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + # cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + # sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed @@ -282,6 +282,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, output_attentions: bool = False, use_cache: bool = False, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: @@ -306,7 +307,7 @@ def forward( query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - + ''' query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) @@ -314,7 +315,10 @@ def forward( kv_seq_len = 
key_states.shape[-2] if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + if rotary_pos_emb is None: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + else: + cos, sin = rotary_pos_emb query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: @@ -327,8 +331,32 @@ def forward( # repeat k/v heads if n_kv_heads < n_heads key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + ''' + #--------------- + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + kv_seq_len = key_states.shape[1] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[1] + # rope + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + # kv cache + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=1) + value_states = torch.cat((past_value, value_states), dim=1) + past_key_value = torch.stack((key_states, value_states)) + query_states = query_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) + value_states = value_states.transpose(1, 2) + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + #--------------- + attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( @@ -384,6 +412,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: @@ -411,6 +440,7 @@ def forward( attention_mask=attention_mask, position_ids=position_ids, past_key_value=past_key_value, + rotary_pos_emb=rotary_pos_emb, output_attentions=output_attentions, use_cache=use_cache, ) diff --git a/transformers/llm/export/llm_models/MiniCPM-1.2b/config.json b/transformers/llm/export/llm_models/MiniCPM-1.2b/config.json old mode 100755 new mode 100644 diff --git a/transformers/llm/export/llm_models/MiniCPM-2.4b/config.json b/transformers/llm/export/llm_models/MiniCPM-2.4b/config.json old mode 100755 new mode 100644 diff --git a/transformers/llm/export/llm_models/Qwen-7B-Chat/modeling_qwen.py b/transformers/llm/export/llm_models/Qwen-7B-Chat/modeling_qwen.py index 94437bcd2..698486f6f 100644 --- a/transformers/llm/export/llm_models/Qwen-7B-Chat/modeling_qwen.py +++ b/transformers/llm/export/llm_models/Qwen-7B-Chat/modeling_qwen.py @@ -250,12 +250,7 @@ def _attn(self, query, key, value, attention_mask=None, head_mask=None): attn_weights = torch.matmul(query, key.transpose(-1, -2)) if self.scale_attn_weights: - attn_weights = attn_weights / 
torch.full( - [], - value.size(-1) ** 0.5, - dtype=attn_weights.dtype, - device=attn_weights.device, - ) + attn_weights = attn_weights / math.sqrt(self.head_dim) query_length, key_length = query.size(-2), key.size(-2) if attention_mask is None: @@ -813,7 +808,7 @@ def __init__(self, config): logger.warn("Your device support faster inference by passing bf16=True in \"AutoModelForCausalLM.from_pretrained\".") elif SUPPORT_FP16: logger.warn("Your device support faster inference by passing fp16=True in \"AutoModelForCausalLM.from_pretrained\".") - + if config.use_flash_attn == "auto": if config.bf16 or config.fp16: logger.warn("Try importing flash-attention for faster inference...") diff --git a/transformers/llm/export/llm_models/Qwen-VL-Chat/modeling_qwen.py b/transformers/llm/export/llm_models/Qwen-VL-Chat/modeling_qwen.py index 94c7453b1..d7b3c4798 100755 --- a/transformers/llm/export/llm_models/Qwen-VL-Chat/modeling_qwen.py +++ b/transformers/llm/export/llm_models/Qwen-VL-Chat/modeling_qwen.py @@ -149,14 +149,8 @@ def _attn(self, query, key, value, registered_causal_mask, attention_mask=None, attn_weights = torch.matmul(query, key.transpose(-1, -2)) if self.scale_attn_weights: - attn_weights = attn_weights / torch.full( - [], - value.size(-1) ** 0.5, - dtype=attn_weights.dtype, - device=attn_weights.device, - ) + attn_weights = attn_weights / math.sqrt(self.head_dim) - query_length, key_length = query.size(-2), key.size(-2) # causal_mask = self.bias[ # :, :, key_length - query_length : key_length, :key_length # ] @@ -295,7 +289,7 @@ def forward( else: present = None - if self.use_logn_attn and not self.training: + if self.use_logn_attn and not self.training and False: if self.logn_tensor.device != query.device or self.logn_tensor.dtype != query.dtype: self.logn_tensor = self.logn_tensor.to(query.device).type_as(query) seq_start = key.size(1) - query.size(1) @@ -515,7 +509,7 @@ def get_input_embeddings(self): def set_input_embeddings(self, new_embeddings): self.wte = new_embeddings - + # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): # create causal mask @@ -1110,7 +1104,7 @@ def update_rotary_pos_emb_cache(self, max_seq_len, offset=0, ntk_alpha=1.0): self._ntk_alpha_cached = ntk_alpha seq = torch.arange(self._seq_len_cached, device=self.inv_freq.device) freqs = torch.outer(seq.type_as(self.inv_freq), self.inv_freq) - + emb = torch.cat((freqs, freqs), dim=-1) from einops import rearrange diff --git a/transformers/llm/export/llm_models/Qwen1_5-0_5B-Chat/modeling_qwen2.py b/transformers/llm/export/llm_models/Qwen1_5-0_5B-Chat/modeling_qwen2.py index 8afc2ecb5..595a3e91c 100644 --- a/transformers/llm/export/llm_models/Qwen1_5-0_5B-Chat/modeling_qwen2.py +++ b/transformers/llm/export/llm_models/Qwen1_5-0_5B-Chat/modeling_qwen2.py @@ -313,10 +313,12 @@ def forward( value_states = torch.cat((past_value, value_states), dim=1) past_key_value = torch.stack((key_states, value_states)) query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) value_states = value_states.transpose(1, 2) + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) #--------------- - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + attn_weights = 
torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( diff --git a/transformers/llm/export/llm_models/Qwen1_5-1_8B-Chat/modeling_qwen2.py b/transformers/llm/export/llm_models/Qwen1_5-1_8B-Chat/modeling_qwen2.py index 8afc2ecb5..595a3e91c 100644 --- a/transformers/llm/export/llm_models/Qwen1_5-1_8B-Chat/modeling_qwen2.py +++ b/transformers/llm/export/llm_models/Qwen1_5-1_8B-Chat/modeling_qwen2.py @@ -313,10 +313,12 @@ def forward( value_states = torch.cat((past_value, value_states), dim=1) past_key_value = torch.stack((key_states, value_states)) query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) value_states = value_states.transpose(1, 2) + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) #--------------- - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( diff --git a/transformers/llm/export/llm_models/Qwen1_5-4B-Chat/modeling_qwen2.py b/transformers/llm/export/llm_models/Qwen1_5-4B-Chat/modeling_qwen2.py index 8afc2ecb5..595a3e91c 100644 --- a/transformers/llm/export/llm_models/Qwen1_5-4B-Chat/modeling_qwen2.py +++ b/transformers/llm/export/llm_models/Qwen1_5-4B-Chat/modeling_qwen2.py @@ -313,10 +313,12 @@ def forward( value_states = torch.cat((past_value, value_states), dim=1) past_key_value = torch.stack((key_states, value_states)) query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) value_states = value_states.transpose(1, 2) + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) #--------------- - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( diff --git a/transformers/llm/export/llm_models/Qwen1_5-7B-Chat/modeling_qwen2.py b/transformers/llm/export/llm_models/Qwen1_5-7B-Chat/modeling_qwen2.py index 8afc2ecb5..595a3e91c 100644 --- a/transformers/llm/export/llm_models/Qwen1_5-7B-Chat/modeling_qwen2.py +++ b/transformers/llm/export/llm_models/Qwen1_5-7B-Chat/modeling_qwen2.py @@ -313,10 +313,12 @@ def forward( value_states = torch.cat((past_value, value_states), dim=1) past_key_value = torch.stack((key_states, value_states)) query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) value_states = value_states.transpose(1, 2) + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) #--------------- - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( diff --git a/transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/config.json 
b/transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/config.json new file mode 100755 index 000000000..8f9ea8a58 --- /dev/null +++ b/transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/config.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "auto_map": { + "AutoConfig": "configuration_qwen2.Qwen2Config", + "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM" + }, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 896, + "initializer_range": 0.02, + "intermediate_size": 4864, + "max_position_embeddings": 32768, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 14, + "num_hidden_layers": 24, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.40.1", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/configuration_qwen2.py b/transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/configuration_qwen2.py new file mode 100644 index 000000000..b6ca1ed43 --- /dev/null +++ b/transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/configuration_qwen2.py @@ -0,0 +1,144 @@ +# coding=utf-8 +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Qwen2 model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "Qwen/Qwen2-7B-beta": "https://huggingface.co/Qwen/Qwen2-7B-beta/resolve/main/config.json", +} + + +class Qwen2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a + Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of + Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 151936): + Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Qwen2Model`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 22016): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. 
+ num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 32): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 32768): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + use_sliding_window (`bool`, *optional*, defaults to `False`): + Whether to use sliding window attention. + sliding_window (`int`, *optional*, defaults to 4096): + Sliding window attention (SWA) window size. If not specified, will default to `4096`. + max_window_layers (`int`, *optional*, defaults to 28): + The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. 
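For the Qwen2-0_5B-Instruct checkpoint added in this patch (hidden_size 896, 14 attention heads, 2 key/value heads), the grouped-query arithmetic described above works out as follows. This is a hedged, illustrative sketch rather than part of the patch; the repeat_kv helper simply mirrors the one defined in modeling_qwen2.py further down.

    import torch

    hidden_size, num_attention_heads, num_key_value_heads = 896, 14, 2
    head_dim = hidden_size // num_attention_heads        # 64
    n_rep = num_attention_heads // num_key_value_heads   # 7 query heads share each KV head

    def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
        # Same expand/reshape trick as repeat_kv in modeling_qwen2.py.
        batch, kv_heads, slen, hd = hidden_states.shape
        if n_rep == 1:
            return hidden_states
        hidden_states = hidden_states[:, :, None, :, :].expand(batch, kv_heads, n_rep, slen, hd)
        return hidden_states.reshape(batch, kv_heads * n_rep, slen, hd)

    k = torch.zeros(1, num_key_value_heads, 10, head_dim)
    print(repeat_kv(k, n_rep).shape)  # torch.Size([1, 14, 10, 64])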
+ + ```python + >>> from transformers import Qwen2Model, Qwen2Config + + >>> # Initializing a Qwen2 style configuration + >>> configuration = Qwen2Config() + + >>> # Initializing a model from the Qwen2-7B style configuration + >>> model = Qwen2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "qwen2" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=151936, + hidden_size=4096, + intermediate_size=22016, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=32, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + tie_word_embeddings=False, + rope_theta=10000.0, + use_sliding_window=False, + sliding_window=4096, + max_window_layers=28, + attention_dropout=0.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.use_sliding_window = use_sliding_window + self.sliding_window = sliding_window + self.max_window_layers = max_window_layers + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + + super().__init__( + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/modeling_qwen2.py b/transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/modeling_qwen2.py new file mode 100644 index 000000000..595a3e91c --- /dev/null +++ b/transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/modeling_qwen2.py @@ -0,0 +1,1436 @@ +# coding=utf-8 +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
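One detail shared by every attention rewrite in this patch (Baichuan, Llama-2/3, Qwen1.5, and the modeling_qwen2.py that follows): keys are permuted to [batch, heads, head_dim, kv_len] via permute([0, 2, 3, 1]) so the score matmul needs no transpose inside the traced graph. A quick sanity check, with illustrative shapes only, that this layout yields the same scores as the stock transpose(2, 3) form:

    import torch

    b, h, q_len, kv_len, d = 1, 2, 3, 5, 4
    q = torch.randn(b, h, q_len, d)
    k = torch.randn(b, kv_len, h, d)   # the patch keeps K as [batch, seq, heads, head_dim]

    scores_patch = torch.matmul(q, k.permute(0, 2, 3, 1)) / d ** 0.5
    scores_stock = torch.matmul(q, k.transpose(1, 2).transpose(2, 3)) / d ** 0.5
    print(torch.allclose(scores_patch, scores_stock))  # True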
+""" PyTorch Qwen2 model.""" +import inspect +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_qwen2 import Qwen2Config + + +# if is_flash_attn_2_available(): + #from flash_attn import flash_attn_func, flash_attn_varlen_func + #from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + #_flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + + +logger = logging.get_logger(__name__) + + +_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta" +_CONFIG_FOR_DOC = "Qwen2Config" + +QWEN2_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "Qwen/Qwen2-7B-beta", + # See all Qwen2 models at https://huggingface.co/models?filter=qwen2 +] + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2 +class Qwen2RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Qwen2RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2 +class Qwen2RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. 
+ self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
+ """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2 +class Qwen2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class Qwen2Attention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " + "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + self.rotary_emb = Qwen2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + rotary_pos_emb: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[torch.Tensor] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + ''' + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + kv_seq_len += past_key_value[0].shape[2] + if rotary_pos_emb is None: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + else: + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=2) + value_states = torch.cat((past_value, value_states), dim=2) + # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + past_key_value = torch.stack((key_states, value_states)) + # repeat k/v heads if n_kv_heads < n_heads + # key_states = repeat_kv(key_states, self.num_key_value_groups) + # value_states = repeat_kv(value_states, self.num_key_value_groups) + ''' + #--------------- + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + kv_seq_len = key_states.shape[1] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[1] + # rope + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + # kv cache + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=1) + value_states = torch.cat((past_value, value_states), dim=1) + past_key_value = torch.stack((key_states, value_states)) + query_states = query_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) + value_states = value_states.transpose(1, 2) + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + #--------------- + attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + 
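The rewritten (non-commented) path in the eager `forward` above is the export-oriented variant: q/k/v stay in `[bsz, q_len, heads, head_dim]` layout, RoPE is applied with the externally supplied `rotary_pos_emb` (cos/sin) instead of a `position_ids` gather, and the key is permuted to `[bsz, heads, head_dim, q_len]` so the score matmul needs no further transpose, which appears to be the layout the attention-fuse pass targets. The standalone sketch below is not part of the patch; the sizes and the locally recomputed cos/sin are illustrative assumptions used only to walk through the shapes.

```python
import math
import torch

bsz, q_len, num_heads, num_kv_heads, head_dim = 1, 4, 12, 2, 64

def rotate_half(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

q = torch.randn(bsz, q_len, num_heads, head_dim)
k = torch.randn(bsz, q_len, num_kv_heads, head_dim)
v = torch.randn(bsz, q_len, num_kv_heads, head_dim)

# rotary_pos_emb is assumed to be precomputed outside the layer with a shape
# broadcastable to [bsz, q_len, heads, head_dim], e.g. [1, q_len, 1, head_dim].
inv_freq = 1.0 / (10000.0 ** (torch.arange(0, head_dim, 2).float() / head_dim))
freqs = torch.outer(torch.arange(q_len).float(), inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)[None, :, None, :]
cos, sin = emb.cos(), emb.sin()

# fused RoPE path, as in the forward above (no position_ids indexing)
q = q * cos + rotate_half(q) * sin
k = k * cos + rotate_half(k) * sin

q = q.transpose(1, 2)        # [bsz, heads, q_len, head_dim]
k = k.permute([0, 2, 3, 1])  # [bsz, kv_heads, head_dim, q_len]
v = v.transpose(1, 2)        # [bsz, kv_heads, q_len, head_dim]

# GQA: repeat kv heads along dim 1 (same effect as repeat_kv above)
k = k.repeat_interleave(num_heads // num_kv_heads, dim=1)
v = v.repeat_interleave(num_heads // num_kv_heads, dim=1)

scores = torch.matmul(q, k) / math.sqrt(head_dim)   # [bsz, heads, q_len, q_len]
out = torch.matmul(scores.softmax(dim=-1), v)       # [bsz, heads, q_len, head_dim]
print(out.shape)  # torch.Size([1, 12, 4, 64])
```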
+ +class Qwen2FlashAttention2(Qwen2Attention): + """ + Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention` + as the weights of the module stays untouched. The only required change would be on the forward pass + where it needs to correctly call the public API of flash attention and deal with padding tokens + in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom + config.max_window_layers layers. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # Because the input can be padded, the absolute sequence length depends on the max position id. + rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and self.config.use_sliding_window + ) + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." 
+ ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. 
+ + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`int`, *optional*): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. + causal = self.is_causal and query_length != 1 + + # Decide whether to use SWA or not by layer index. + if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: + use_sliding_windows = False + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Qwen2 +class Qwen2SdpaAttention(Qwen2Attention): + """ + Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from Qwen2Attention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +QWEN2_ATTENTION_CLASSES = { + "eager": Qwen2Attention, + "flash_attention_2": Qwen2FlashAttention2, + "sdpa": Qwen2SdpaAttention, +} + + +class Qwen2DecoderLayer(nn.Module): + def __init__(self, config: Qwen2Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + if config.use_sliding_window and config._attn_implementation != "flash_attention_2": + logger.warning_once( + f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; " + "unexpected results may be encountered." 
+ ) + # self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) + self.self_attn = Qwen2Attention(config, layer_idx) + + self.mlp = Qwen2MLP(config) + self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + rotary_pos_emb: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. " + "Please make sure use `attention_mask` instead.`" + ) + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + rotary_pos_emb=rotary_pos_emb, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +QWEN2_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`Qwen2Config`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+""" + + +@add_start_docstrings( + "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", + QWEN2_START_DOCSTRING, +) +class Qwen2PreTrainedModel(PreTrainedModel): + config_class = Qwen2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Qwen2DecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +QWEN2_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. 
+ + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", + QWEN2_START_DOCSTRING, +) +class Qwen2Model(Qwen2PreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`] + + Args: + config: Qwen2Config + """ + + def __init__(self, config: Qwen2Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self._attn_implementation = config._attn_implementation + self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, 
seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + past_key_values_length = 0 + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + if self._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._attn_implementation == "sdpa" and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. 
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class Qwen2ForCausalLM(Qwen2PreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = Qwen2Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling 
loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, Qwen2ForCausalLM + + >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. 
We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +@add_start_docstrings( + """ + The Qwen2 Model transformer with a sequence classification head on top (linear layer). + + [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
+ """, + QWEN2_START_DOCSTRING, +) +class Qwen2ForSequenceClassification(Qwen2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Qwen2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif 
self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/config.json b/transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/config.json new file mode 100755 index 000000000..bdc572b07 --- /dev/null +++ b/transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/config.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "auto_map": { + "AutoConfig": "configuration_qwen2.Qwen2Config", + "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM" + }, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 32768, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.40.1", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/configuration_qwen2.py b/transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/configuration_qwen2.py new file mode 100644 index 000000000..b6ca1ed43 --- /dev/null +++ b/transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/configuration_qwen2.py @@ -0,0 +1,144 @@ +# coding=utf-8 +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Qwen2 model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "Qwen/Qwen2-7B-beta": "https://huggingface.co/Qwen/Qwen2-7B-beta/resolve/main/config.json", +} + + +class Qwen2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a + Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of + Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. 
+ + + Args: + vocab_size (`int`, *optional*, defaults to 151936): + Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Qwen2Model`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 22016): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 32): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 32768): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + use_sliding_window (`bool`, *optional*, defaults to `False`): + Whether to use sliding window attention. + sliding_window (`int`, *optional*, defaults to 4096): + Sliding window attention (SWA) window size. If not specified, will default to `4096`. + max_window_layers (`int`, *optional*, defaults to 28): + The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. 
+ + ```python + >>> from transformers import Qwen2Model, Qwen2Config + + >>> # Initializing a Qwen2 style configuration + >>> configuration = Qwen2Config() + + >>> # Initializing a model from the Qwen2-7B style configuration + >>> model = Qwen2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "qwen2" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=151936, + hidden_size=4096, + intermediate_size=22016, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=32, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + tie_word_embeddings=False, + rope_theta=10000.0, + use_sliding_window=False, + sliding_window=4096, + max_window_layers=28, + attention_dropout=0.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.use_sliding_window = use_sliding_window + self.sliding_window = sliding_window + self.max_window_layers = max_window_layers + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + + super().__init__( + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/modeling_qwen2.py b/transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/modeling_qwen2.py new file mode 100644 index 000000000..595a3e91c --- /dev/null +++ b/transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/modeling_qwen2.py @@ -0,0 +1,1436 @@ +# coding=utf-8 +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
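For reference, the exported Qwen2-1_5B-Instruct `config.json` above implies `head_dim = 1536 / 12 = 128` and a grouped-query ratio of `12 / 2 = 6`, which is the geometry the modeling code below derives in `Qwen2Attention.__init__`. A hypothetical sanity check, not part of the patch (the local directory name and the `trust_remote_code` loading path are assumptions):

```python
from transformers import AutoConfig

# Assumes a local directory containing the config.json / configuration_qwen2.py above.
config = AutoConfig.from_pretrained("Qwen2-1_5B-Instruct", trust_remote_code=True)
head_dim = config.hidden_size // config.num_attention_heads           # 1536 // 12 = 128
kv_groups = config.num_attention_heads // config.num_key_value_heads  # 12 // 2 = 6
print(head_dim, kv_groups, config.rope_theta)                         # 128 6 1000000.0
```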
+""" PyTorch Qwen2 model.""" +import inspect +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_qwen2 import Qwen2Config + + +# if is_flash_attn_2_available(): + #from flash_attn import flash_attn_func, flash_attn_varlen_func + #from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + #_flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + + +logger = logging.get_logger(__name__) + + +_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta" +_CONFIG_FOR_DOC = "Qwen2Config" + +QWEN2_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "Qwen/Qwen2-7B-beta", + # See all Qwen2 models at https://huggingface.co/models?filter=qwen2 +] + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2 +class Qwen2RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Qwen2RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2 +class Qwen2RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. 
+ self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
+ """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2 +class Qwen2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class Qwen2Attention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " + "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + self.rotary_emb = Qwen2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + rotary_pos_emb: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[torch.Tensor] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + ''' + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + kv_seq_len += past_key_value[0].shape[2] + if rotary_pos_emb is None: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + else: + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=2) + value_states = torch.cat((past_value, value_states), dim=2) + # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + past_key_value = torch.stack((key_states, value_states)) + # repeat k/v heads if n_kv_heads < n_heads + # key_states = repeat_kv(key_states, self.num_key_value_groups) + # value_states = repeat_kv(value_states, self.num_key_value_groups) + ''' + #--------------- + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + kv_seq_len = key_states.shape[1] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[1] + # rope + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + # kv cache + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=1) + value_states = torch.cat((past_value, value_states), dim=1) + past_key_value = torch.stack((key_states, value_states)) + query_states = query_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) + value_states = value_states.transpose(1, 2) + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + #--------------- + attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + 
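The rewritten block between the `#---------------` markers above is what replaces the stock `transformers` path for export: cos/sin arrive precomputed as `rotary_pos_emb` and are applied directly with `rotate_half`, tensors stay in `[batch, seq, heads, head_dim]` layout until just before the score matmul, and the KV cache is a plain stacked tensor rather than a `Cache` object. A minimal, self-contained sketch of those conventions (plain PyTorch, toy sizes, not part of the patch):

```python
# Illustrative sketch only: the rotary application and stacked-KV convention
# used by the rewritten eager path, on toy tensors.
import torch

def rotate_half(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

bsz, seq, heads, head_dim = 1, 4, 2, 8
q = torch.randn(bsz, seq, heads, head_dim)
k = torch.randn(bsz, seq, heads, head_dim)
v = torch.randn(bsz, seq, heads, head_dim)

# cos/sin shaped to broadcast over the head axis of [bsz, seq, heads, head_dim].
inv_freq = 1.0 / (10000.0 ** (torch.arange(0, head_dim, 2).float() / head_dim))
freqs = torch.outer(torch.arange(seq).float(), inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)                 # [seq, head_dim]
cos, sin = emb.cos()[None, :, None, :], emb.sin()[None, :, None, :]

q = (q * cos) + (rotate_half(q) * sin)
k = (k * cos) + (rotate_half(k) * sin)

# KV cache as one stacked tensor of shape [2, bsz, kv_seq, heads, head_dim];
# new keys/values are appended along the sequence axis (dim=1 of each half).
past_key_value = torch.stack((k, v))
k_next_step = torch.cat((past_key_value[0], k), dim=1)
print(past_key_value.shape, k_next_step.shape)          # [2,1,4,2,8] / [1,8,2,8]
```

Keeping the rotary inputs external and the cache as a single tensor keeps `Cache` bookkeeping out of the traced graph, which presumably is what allows the converter's attention-fuse pass to match this subgraph.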
+ +class Qwen2FlashAttention2(Qwen2Attention): + """ + Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention` + as the weights of the module stays untouched. The only required change would be on the forward pass + where it needs to correctly call the public API of flash attention and deal with padding tokens + in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom + config.max_window_layers layers. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # Because the input can be padded, the absolute sequence length depends on the max position id. + rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and self.config.use_sliding_window + ) + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." 
+ ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. 
+ + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`int`, *optional*): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. + causal = self.is_causal and query_length != 1 + + # Decide whether to use SWA or not by layer index. + if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: + use_sliding_windows = False + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Qwen2 +class Qwen2SdpaAttention(Qwen2Attention): + """ + Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from Qwen2Attention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +QWEN2_ATTENTION_CLASSES = { + "eager": Qwen2Attention, + "flash_attention_2": Qwen2FlashAttention2, + "sdpa": Qwen2SdpaAttention, +} + + +class Qwen2DecoderLayer(nn.Module): + def __init__(self, config: Qwen2Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + if config.use_sliding_window and config._attn_implementation != "flash_attention_2": + logger.warning_once( + f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; " + "unexpected results may be encountered." 
+ ) + # self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) + self.self_attn = Qwen2Attention(config, layer_idx) + + self.mlp = Qwen2MLP(config) + self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + rotary_pos_emb: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. " + "Please make sure use `attention_mask` instead.`" + ) + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + rotary_pos_emb=rotary_pos_emb, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +QWEN2_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`Qwen2Config`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+""" + + +@add_start_docstrings( + "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", + QWEN2_START_DOCSTRING, +) +class Qwen2PreTrainedModel(PreTrainedModel): + config_class = Qwen2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Qwen2DecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +QWEN2_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. 
+ + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", + QWEN2_START_DOCSTRING, +) +class Qwen2Model(Qwen2PreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`] + + Args: + config: Qwen2Config + """ + + def __init__(self, config: Qwen2Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self._attn_implementation = config._attn_implementation + self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, 
seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + past_key_values_length = 0 + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + if self._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._attn_implementation == "sdpa" and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. 
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class Qwen2ForCausalLM(Qwen2PreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = Qwen2Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling 
loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, Qwen2ForCausalLM + + >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. 
We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +@add_start_docstrings( + """ + The Qwen2 Model transformer with a sequence classification head on top (linear layer). + + [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
+ """, + QWEN2_START_DOCSTRING, +) +class Qwen2ForSequenceClassification(Qwen2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Qwen2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif 
self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/transformers/llm/export/llm_models/Qwen2-1_5B/config.json b/transformers/llm/export/llm_models/Qwen2-1_5B/config.json new file mode 100755 index 000000000..08a0ac476 --- /dev/null +++ b/transformers/llm/export/llm_models/Qwen2-1_5B/config.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "auto_map": { + "AutoConfig": "configuration_qwen2.Qwen2Config", + "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM" + }, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151643, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 131072, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_theta": 1000000.0, + "sliding_window": 131072, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/transformers/llm/export/llm_models/Qwen2-1_5B/configuration_qwen2.py b/transformers/llm/export/llm_models/Qwen2-1_5B/configuration_qwen2.py new file mode 100644 index 000000000..b6ca1ed43 --- /dev/null +++ b/transformers/llm/export/llm_models/Qwen2-1_5B/configuration_qwen2.py @@ -0,0 +1,144 @@ +# coding=utf-8 +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Qwen2 model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "Qwen/Qwen2-7B-beta": "https://huggingface.co/Qwen/Qwen2-7B-beta/resolve/main/config.json", +} + + +class Qwen2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a + Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of + Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. 
+ + + Args: + vocab_size (`int`, *optional*, defaults to 151936): + Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Qwen2Model`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 22016): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 32): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 32768): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + use_sliding_window (`bool`, *optional*, defaults to `False`): + Whether to use sliding window attention. + sliding_window (`int`, *optional*, defaults to 4096): + Sliding window attention (SWA) window size. If not specified, will default to `4096`. + max_window_layers (`int`, *optional*, defaults to 28): + The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. 
+ + ```python + >>> from transformers import Qwen2Model, Qwen2Config + + >>> # Initializing a Qwen2 style configuration + >>> configuration = Qwen2Config() + + >>> # Initializing a model from the Qwen2-7B style configuration + >>> model = Qwen2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "qwen2" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=151936, + hidden_size=4096, + intermediate_size=22016, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=32, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + tie_word_embeddings=False, + rope_theta=10000.0, + use_sliding_window=False, + sliding_window=4096, + max_window_layers=28, + attention_dropout=0.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.use_sliding_window = use_sliding_window + self.sliding_window = sliding_window + self.max_window_layers = max_window_layers + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + + super().__init__( + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/transformers/llm/export/llm_models/Qwen2-1_5B/modeling_qwen2.py b/transformers/llm/export/llm_models/Qwen2-1_5B/modeling_qwen2.py new file mode 100644 index 000000000..f8d5b5345 --- /dev/null +++ b/transformers/llm/export/llm_models/Qwen2-1_5B/modeling_qwen2.py @@ -0,0 +1,1434 @@ +# coding=utf-8 +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PyTorch Qwen2 model.""" +import inspect +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_qwen2 import Qwen2Config + + +# if is_flash_attn_2_available(): + #from flash_attn import flash_attn_func, flash_attn_varlen_func + #from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + #_flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + + +logger = logging.get_logger(__name__) + + +_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta" +_CONFIG_FOR_DOC = "Qwen2Config" + +QWEN2_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "Qwen/Qwen2-7B-beta", + # See all Qwen2 models at https://huggingface.co/models?filter=qwen2 +] + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2 +class Qwen2RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Qwen2RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2 +class Qwen2RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. 
+ self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
+    """
+    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
+    sin = sin[position_ids].unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2
+class Qwen2MLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+
+# Copied from transformers.models.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+class Qwen2Attention(nn.Module):
+    """
+    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
+    and "Generating Long Sequences with Sparse Transformers".
+    """
+
+    def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.is_causal = True
+        self.attention_dropout = config.attention_dropout
+
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+
+        self.rotary_emb = Qwen2RotaryEmbedding(
+            self.head_dim,
+            max_position_embeddings=self.max_position_embeddings,
+            base=self.rope_theta,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        rotary_pos_emb: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure to use `attention_mask` instead."
+            )
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        '''
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+ ) + # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + kv_seq_len += past_key_value[0].shape[2] + if rotary_pos_emb is None: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + else: + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=2) + value_states = torch.cat((past_value, value_states), dim=2) + # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + past_key_value = torch.stack((key_states, value_states)) + ''' + #--------------- + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + kv_seq_len = key_states.shape[1] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[1] + # rope + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + # kv cache + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=1) + value_states = torch.cat((past_value, value_states), dim=1) + past_key_value = torch.stack((key_states, value_states)) + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + #--------------- + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class Qwen2FlashAttention2(Qwen2Attention): + """ + Qwen2 flash attention module, following Qwen2 attention module. 
This module inherits from `Qwen2Attention`
+    as the weights of the module stay untouched. The only required change would be on the forward pass
+    where it needs to correctly call the public API of flash attention and deal with padding tokens
+    in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom
+    config.max_window_layers layers.
+    """
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        **kwargs,
+    ):
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure to use `attention_mask` instead."
+            )
+
+            # overwrite attention_mask with padding_mask
+            attention_mask = kwargs.pop("padding_mask")
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+
+        # Because the input can be padded, the absolute sequence length depends on the max position id.
+        rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
+        cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
+
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+        use_sliding_windows = (
+            _flash_supports_window_size
+            and getattr(self.config, "sliding_window", None) is not None
+            and kv_seq_len > self.config.sliding_window
+            and self.config.use_sliding_window
+        )
+
+        if not _flash_supports_window_size:
+            logger.warning_once(
+                "The current flash attention version does not support sliding window attention, for a more memory efficient implementation"
+                " make sure to upgrade flash-attn library."
+ ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. 
+ + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`int`, *optional*): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. + causal = self.is_causal and query_length != 1 + + # Decide whether to use SWA or not by layer index. + if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: + use_sliding_windows = False + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Qwen2 +class Qwen2SdpaAttention(Qwen2Attention): + """ + Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from Qwen2Attention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +QWEN2_ATTENTION_CLASSES = { + "eager": Qwen2Attention, + "flash_attention_2": Qwen2FlashAttention2, + "sdpa": Qwen2SdpaAttention, +} + + +class Qwen2DecoderLayer(nn.Module): + def __init__(self, config: Qwen2Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + if config.use_sliding_window and config._attn_implementation != "flash_attention_2": + logger.warning_once( + f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; " + "unexpected results may be encountered." 
+ ) + # self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) + self.self_attn = Qwen2Attention(config, layer_idx) + + self.mlp = Qwen2MLP(config) + self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + rotary_pos_emb: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. " + "Please make sure use `attention_mask` instead.`" + ) + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + rotary_pos_emb=rotary_pos_emb, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +QWEN2_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`Qwen2Config`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+""" + + +@add_start_docstrings( + "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", + QWEN2_START_DOCSTRING, +) +class Qwen2PreTrainedModel(PreTrainedModel): + config_class = Qwen2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Qwen2DecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +QWEN2_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. 
+ + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", + QWEN2_START_DOCSTRING, +) +class Qwen2Model(Qwen2PreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`] + + Args: + config: Qwen2Config + """ + + def __init__(self, config: Qwen2Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self._attn_implementation = config._attn_implementation + self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, 
seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + past_key_values_length = 0 + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + if self._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._attn_implementation == "sdpa" and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. 
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class Qwen2ForCausalLM(Qwen2PreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = Qwen2Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling 
loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, Qwen2ForCausalLM + + >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. 
We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +@add_start_docstrings( + """ + The Qwen2 Model transformer with a sequence classification head on top (linear layer). + + [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
+ """, + QWEN2_START_DOCSTRING, +) +class Qwen2ForSequenceClassification(Qwen2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Qwen2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif 
self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/transformers/llm/export/llm_models/Qwen2-7B-Instruct/config.json b/transformers/llm/export/llm_models/Qwen2-7B-Instruct/config.json new file mode 100755 index 000000000..eac7cd285 --- /dev/null +++ b/transformers/llm/export/llm_models/Qwen2-7B-Instruct/config.json @@ -0,0 +1,31 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "auto_map": { + "AutoConfig": "configuration_qwen2.Qwen2Config", + "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM" + }, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 3584, + "initializer_range": 0.02, + "intermediate_size": 18944, + "max_position_embeddings": 32768, + "max_window_layers": 28, + "model_type": "qwen2", + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.41.2", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 152064 +} diff --git a/transformers/llm/export/llm_models/Qwen2-7B-Instruct/configuration_qwen2.py b/transformers/llm/export/llm_models/Qwen2-7B-Instruct/configuration_qwen2.py new file mode 100644 index 000000000..b6ca1ed43 --- /dev/null +++ b/transformers/llm/export/llm_models/Qwen2-7B-Instruct/configuration_qwen2.py @@ -0,0 +1,144 @@ +# coding=utf-8 +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Qwen2 model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "Qwen/Qwen2-7B-beta": "https://huggingface.co/Qwen/Qwen2-7B-beta/resolve/main/config.json", +} + + +class Qwen2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a + Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of + Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. 
+ + + Args: + vocab_size (`int`, *optional*, defaults to 151936): + Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Qwen2Model`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 22016): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 32): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 32768): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + use_sliding_window (`bool`, *optional*, defaults to `False`): + Whether to use sliding window attention. + sliding_window (`int`, *optional*, defaults to 4096): + Sliding window attention (SWA) window size. If not specified, will default to `4096`. + max_window_layers (`int`, *optional*, defaults to 28): + The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. 
+ + ```python + >>> from transformers import Qwen2Model, Qwen2Config + + >>> # Initializing a Qwen2 style configuration + >>> configuration = Qwen2Config() + + >>> # Initializing a model from the Qwen2-7B style configuration + >>> model = Qwen2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "qwen2" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=151936, + hidden_size=4096, + intermediate_size=22016, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=32, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + tie_word_embeddings=False, + rope_theta=10000.0, + use_sliding_window=False, + sliding_window=4096, + max_window_layers=28, + attention_dropout=0.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.use_sliding_window = use_sliding_window + self.sliding_window = sliding_window + self.max_window_layers = max_window_layers + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + + super().__init__( + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/transformers/llm/export/llm_models/Qwen2-7B-Instruct/modeling_qwen2.py b/transformers/llm/export/llm_models/Qwen2-7B-Instruct/modeling_qwen2.py new file mode 100644 index 000000000..595a3e91c --- /dev/null +++ b/transformers/llm/export/llm_models/Qwen2-7B-Instruct/modeling_qwen2.py @@ -0,0 +1,1436 @@ +# coding=utf-8 +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
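As a quick cross-check of the configuration shipped above (illustrative only, not part of the exported files, and assuming a transformers release that already exports `Qwen2Config` with the same fields as the local `configuration_qwen2.Qwen2Config`): loading the Qwen2-7B-Instruct values from config.json gives the derived quantities the attention code below relies on, namely a head dimension of 128 and 7 query heads per key/value head.

from transformers import Qwen2Config  # same field names as configuration_qwen2.Qwen2Config

cfg = Qwen2Config(
    hidden_size=3584,
    intermediate_size=18944,
    num_hidden_layers=28,
    num_attention_heads=28,
    num_key_value_heads=4,
    max_position_embeddings=32768,
    rope_theta=1000000.0,
    vocab_size=152064,
)
head_dim = cfg.hidden_size // cfg.num_attention_heads            # 128
kv_groups = cfg.num_attention_heads // cfg.num_key_value_heads   # 7 query heads share one KV head (GQA)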
+""" PyTorch Qwen2 model.""" +import inspect +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_qwen2 import Qwen2Config + + +# if is_flash_attn_2_available(): + #from flash_attn import flash_attn_func, flash_attn_varlen_func + #from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + #_flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + + +logger = logging.get_logger(__name__) + + +_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta" +_CONFIG_FOR_DOC = "Qwen2Config" + +QWEN2_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "Qwen/Qwen2-7B-beta", + # See all Qwen2 models at https://huggingface.co/models?filter=qwen2 +] + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2 +class Qwen2RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Qwen2RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2 +class Qwen2RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. 
+ self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
+ """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2 +class Qwen2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class Qwen2Attention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " + "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + self.rotary_emb = Qwen2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + rotary_pos_emb: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[torch.Tensor] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + ''' + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + kv_seq_len += past_key_value[0].shape[2] + if rotary_pos_emb is None: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + else: + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=2) + value_states = torch.cat((past_value, value_states), dim=2) + # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + past_key_value = torch.stack((key_states, value_states)) + # repeat k/v heads if n_kv_heads < n_heads + # key_states = repeat_kv(key_states, self.num_key_value_groups) + # value_states = repeat_kv(value_states, self.num_key_value_groups) + ''' + #--------------- + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + kv_seq_len = key_states.shape[1] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[1] + # rope + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + # kv cache + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=1) + value_states = torch.cat((past_value, value_states), dim=1) + past_key_value = torch.stack((key_states, value_states)) + query_states = query_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) + value_states = value_states.transpose(1, 2) + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + #--------------- + attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + 
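The rewritten eager path above (the block between the `#---------------` markers) keeps Q/K/V in `[bsz, seq, heads, head_dim]` layout, applies the externally supplied `(cos, sin)` rotary pair, concatenates the KV cache along the sequence axis, and only then permutes K to `[bsz, heads, head_dim, kv_len]` so a single matmul yields the score matrix. A minimal shape sketch under toy dimensions (illustrative only; it reuses the 7B head geometry and omits the `repeat_kv` expansion from 4 KV heads to 28 query heads for brevity):

import torch

bsz, q_len, kv_len, heads, head_dim = 1, 4, 9, 28, 128
q = torch.randn(bsz, q_len, heads, head_dim).transpose(1, 2)       # [1, 28, 4, 128]
k = torch.randn(bsz, kv_len, heads, head_dim).permute(0, 2, 3, 1)  # [1, 28, 128, 9], as key_states.permute([0, 2, 3, 1])
v = torch.randn(bsz, kv_len, heads, head_dim).transpose(1, 2)      # [1, 28, 9, 128]

scores = torch.matmul(q, k) / head_dim ** 0.5                      # [1, 28, 4, 9]
probs = torch.softmax(scores, dim=-1)
out = torch.matmul(probs, v).transpose(1, 2).reshape(bsz, q_len, heads * head_dim)
assert out.shape == (bsz, q_len, 3584)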
+ +class Qwen2FlashAttention2(Qwen2Attention): + """ + Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention` + as the weights of the module stays untouched. The only required change would be on the forward pass + where it needs to correctly call the public API of flash attention and deal with padding tokens + in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom + config.max_window_layers layers. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # Because the input can be padded, the absolute sequence length depends on the max position id. + rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and self.config.use_sliding_window + ) + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." 
+ ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. 
+ + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`int`, *optional*): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. + causal = self.is_causal and query_length != 1 + + # Decide whether to use SWA or not by layer index. + if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: + use_sliding_windows = False + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Qwen2 +class Qwen2SdpaAttention(Qwen2Attention): + """ + Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from Qwen2Attention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +QWEN2_ATTENTION_CLASSES = { + "eager": Qwen2Attention, + "flash_attention_2": Qwen2FlashAttention2, + "sdpa": Qwen2SdpaAttention, +} + + +class Qwen2DecoderLayer(nn.Module): + def __init__(self, config: Qwen2Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + if config.use_sliding_window and config._attn_implementation != "flash_attention_2": + logger.warning_once( + f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; " + "unexpected results may be encountered." 
+ ) + # self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) + self.self_attn = Qwen2Attention(config, layer_idx) + + self.mlp = Qwen2MLP(config) + self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + rotary_pos_emb: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. " + "Please make sure use `attention_mask` instead.`" + ) + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + rotary_pos_emb=rotary_pos_emb, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +QWEN2_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`Qwen2Config`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+""" + + +@add_start_docstrings( + "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", + QWEN2_START_DOCSTRING, +) +class Qwen2PreTrainedModel(PreTrainedModel): + config_class = Qwen2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Qwen2DecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +QWEN2_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. 
+ + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", + QWEN2_START_DOCSTRING, +) +class Qwen2Model(Qwen2PreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`] + + Args: + config: Qwen2Config + """ + + def __init__(self, config: Qwen2Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self._attn_implementation = config._attn_implementation + self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, 
seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + past_key_values_length = 0 + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + if self._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._attn_implementation == "sdpa" and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. 
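+ # Note on the two mask helpers used below: given the optional 2D padding mask of
+ # shape (batch_size, kv_seq_len), both build an additive 4D mask of shape
+ # (batch_size, 1, q_len, kv_seq_len) that already encodes the causal structure:
+ # 0.0 where attention is allowed and the dtype minimum where it is not. The eager
+ # attention path simply adds this mask to the raw scores; the SDPA helper may
+ # instead return None when no explicit mask is required, so that
+ # scaled_dot_product_attention can take its fused causal path.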
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class Qwen2ForCausalLM(Qwen2PreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = Qwen2Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling 
loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, Qwen2ForCausalLM + + >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. 
We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +@add_start_docstrings( + """ + The Qwen2 Model transformer with a sequence classification head on top (linear layer). + + [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
+ """, + QWEN2_START_DOCSTRING, +) +class Qwen2ForSequenceClassification(Qwen2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Qwen2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif 
self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/transformers/llm/export/llm_models/TinyLlama-1.1B-Chat/config.json b/transformers/llm/export/llm_models/TinyLlama-1_1B-Chat/config.json similarity index 100% rename from transformers/llm/export/llm_models/TinyLlama-1.1B-Chat/config.json rename to transformers/llm/export/llm_models/TinyLlama-1_1B-Chat/config.json diff --git a/transformers/llm/export/llm_models/TinyLlama-1.1B-Chat/configuration_llama.py b/transformers/llm/export/llm_models/TinyLlama-1_1B-Chat/configuration_llama.py similarity index 100% rename from transformers/llm/export/llm_models/TinyLlama-1.1B-Chat/configuration_llama.py rename to transformers/llm/export/llm_models/TinyLlama-1_1B-Chat/configuration_llama.py diff --git a/transformers/llm/export/llm_models/TinyLlama-1.1B-Chat/modeling_llama.py b/transformers/llm/export/llm_models/TinyLlama-1_1B-Chat/modeling_llama.py similarity index 95% rename from transformers/llm/export/llm_models/TinyLlama-1.1B-Chat/modeling_llama.py rename to transformers/llm/export/llm_models/TinyLlama-1_1B-Chat/modeling_llama.py index 8c562c604..493b040b7 100644 --- a/transformers/llm/export/llm_models/TinyLlama-1.1B-Chat/modeling_llama.py +++ b/transformers/llm/export/llm_models/TinyLlama-1_1B-Chat/modeling_llama.py @@ -182,8 +182,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids): # sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] cos = torch.squeeze(cos) # [seq_len, dim] sin = torch.squeeze(sin) # [seq_len, dim] - cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + # cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + # sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed @@ -282,6 +282,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, output_attentions: bool = False, use_cache: bool = False, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: @@ -306,7 +307,7 @@ def forward( query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - + ''' query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) @@ -314,7 +315,10 @@ def forward( kv_seq_len = key_states.shape[-2] if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + if rotary_pos_emb is None: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + else: + cos, sin = rotary_pos_emb query_states, key_states = apply_rotary_pos_emb(query_states, 
key_states, cos, sin, position_ids) if past_key_value is not None: @@ -327,8 +331,32 @@ def forward( # repeat k/v heads if n_kv_heads < n_heads key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + ''' + #--------------- + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + kv_seq_len = key_states.shape[1] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[1] + # rope + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + # kv cache + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=1) + value_states = torch.cat((past_value, value_states), dim=1) + past_key_value = torch.stack((key_states, value_states)) + query_states = query_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) + value_states = value_states.transpose(1, 2) + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + #--------------- + attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( @@ -384,6 +412,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: @@ -411,6 +440,7 @@ def forward( attention_mask=attention_mask, position_ids=position_ids, past_key_value=past_key_value, + rotary_pos_emb=rotary_pos_emb, output_attentions=output_attentions, use_cache=use_cache, ) diff --git a/transformers/llm/export/llm_models/Yi-6B-Chat/modeling_llama.py b/transformers/llm/export/llm_models/Yi-6B-Chat/modeling_llama.py index 8c562c604..493b040b7 100644 --- a/transformers/llm/export/llm_models/Yi-6B-Chat/modeling_llama.py +++ b/transformers/llm/export/llm_models/Yi-6B-Chat/modeling_llama.py @@ -182,8 +182,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids): # sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] cos = torch.squeeze(cos) # [seq_len, dim] sin = torch.squeeze(sin) # [seq_len, dim] - cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + # cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + # sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed @@ -282,6 +282,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, output_attentions: bool = False, use_cache: bool = False, ) -> Tuple[torch.Tensor, 
Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: @@ -306,7 +307,7 @@ def forward( query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - + ''' query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) @@ -314,7 +315,10 @@ def forward( kv_seq_len = key_states.shape[-2] if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + if rotary_pos_emb is None: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + else: + cos, sin = rotary_pos_emb query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: @@ -327,8 +331,32 @@ def forward( # repeat k/v heads if n_kv_heads < n_heads key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + ''' + #--------------- + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + kv_seq_len = key_states.shape[1] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[1] + # rope + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + # kv cache + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=1) + value_states = torch.cat((past_value, value_states), dim=1) + past_key_value = torch.stack((key_states, value_states)) + query_states = query_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) + value_states = value_states.transpose(1, 2) + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + #--------------- + attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( @@ -384,6 +412,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: @@ -411,6 +440,7 @@ def forward( attention_mask=attention_mask, position_ids=position_ids, past_key_value=past_key_value, + rotary_pos_emb=rotary_pos_emb, output_attentions=output_attentions, use_cache=use_cache, ) diff --git a/transformers/llm/export/llm_models/deepseek-llm-7b-chat/modeling_llama.py b/transformers/llm/export/llm_models/deepseek-llm-7b-chat/modeling_llama.py index 8c562c604..493b040b7 100644 --- a/transformers/llm/export/llm_models/deepseek-llm-7b-chat/modeling_llama.py +++ 
b/transformers/llm/export/llm_models/deepseek-llm-7b-chat/modeling_llama.py @@ -182,8 +182,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids): # sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] cos = torch.squeeze(cos) # [seq_len, dim] sin = torch.squeeze(sin) # [seq_len, dim] - cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + # cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + # sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed @@ -282,6 +282,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, output_attentions: bool = False, use_cache: bool = False, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: @@ -306,7 +307,7 @@ def forward( query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - + ''' query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) @@ -314,7 +315,10 @@ def forward( kv_seq_len = key_states.shape[-2] if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + if rotary_pos_emb is None: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + else: + cos, sin = rotary_pos_emb query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: @@ -327,8 +331,32 @@ def forward( # repeat k/v heads if n_kv_heads < n_heads key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + ''' + #--------------- + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + kv_seq_len = key_states.shape[1] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[1] + # rope + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + # kv cache + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=1) + value_states = torch.cat((past_value, value_states), dim=1) + past_key_value = torch.stack((key_states, value_states)) + query_states = query_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) + value_states = value_states.transpose(1, 2) + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + #--------------- + attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) if attn_weights.size() != (bsz, 
self.num_heads, q_len, kv_seq_len): raise ValueError( @@ -384,6 +412,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: @@ -411,6 +440,7 @@ def forward( attention_mask=attention_mask, position_ids=position_ids, past_key_value=past_key_value, + rotary_pos_emb=rotary_pos_emb, output_attentions=output_attentions, use_cache=use_cache, ) diff --git a/transformers/llm/export/llm_models/glm-4-9b-chat/modeling_chatglm.py b/transformers/llm/export/llm_models/glm-4-9b-chat/modeling_chatglm.py new file mode 100755 index 000000000..e86f5a2f4 --- /dev/null +++ b/transformers/llm/export/llm_models/glm-4-9b-chat/modeling_chatglm.py @@ -0,0 +1,1238 @@ +""" PyTorch ChatGLM model. """ +import json +import math +import copy +import warnings +import re +import sys + +import torch +import torch.utils.checkpoint +import torch.nn.functional as F +from torch import nn +from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss +from torch.nn.utils import skip_init +from typing import Optional, Tuple, Union, List, Callable, Dict, Any +from copy import deepcopy + +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import logging +from transformers.generation.logits_process import LogitsProcessor +from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput + +from .configuration_chatglm import ChatGLMConfig + +# flags required to enable jit fusion kernels + +if sys.platform != 'darwin': + torch._C._jit_set_profiling_mode(False) + torch._C._jit_set_profiling_executor(False) + torch._C._jit_override_can_fuse_on_cpu(True) + torch._C._jit_override_can_fuse_on_gpu(True) + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM" +_CONFIG_FOR_DOC = "ChatGLMConfig" + +def default_init(cls, *args, **kwargs): + return cls(*args, **kwargs) + + +class InvalidScoreLogitsProcessor(LogitsProcessor): + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + if torch.isnan(scores).any() or torch.isinf(scores).any(): + scores.zero_() + scores[..., 198] = 5e4 + return scores + + +def split_tensor_along_last_dim( + tensor: torch.Tensor, + num_partitions: int, + contiguous_split_chunks: bool = False, +) -> List[torch.Tensor]: + """Split a tensor along its last dimension. + + Arguments: + tensor: input tensor. + num_partitions: number of partitions to split the tensor + contiguous_split_chunks: If True, make each chunk contiguous + in memory. + + Returns: + A list of Tensors + """ + # Get the size and dimension. + last_dim = tensor.dim() - 1 + last_dim_size = tensor.size()[last_dim] // num_partitions + # Split. + tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) + # Note: torch.split does not create contiguous tensors by default. 
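+        # Contiguous copies are made only when the caller asks for them; chunks that will be
+        # reshaped with .view() need contiguous storage, plain reads do not.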
+ if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list + + +class RotaryEmbedding(nn.Module): + def __init__(self, dim, rope_ratio=1, original_impl=False, device=None, dtype=None): + super().__init__() + inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim)) + self.register_buffer("inv_freq", inv_freq) + self.dim = dim + self.original_impl = original_impl + self.rope_ratio = rope_ratio + + def forward_impl( + self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000 + ): + """Enhanced Transformer with Rotary Position Embedding. + + Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/ + transformers/rope/__init__.py. MIT License: + https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license. + """ + # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$ + base = base * self.rope_ratio + theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem)) + + # Create position indexes `[0, 1, ..., seq_len - 1]` + seq_idx = torch.arange(seq_len, dtype=torch.float, device=device) + + # Calculate the product of position index and $\theta_i$ + idx_theta = torch.outer(seq_idx, theta).float() + + cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1) + + # this is to mimic the behaviour of complex32, else we will get different results + if dtype in (torch.float16, torch.bfloat16, torch.int8): + cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half() + return cache + + def forward(self, max_seq_len, offset=0): + return self.forward_impl( + max_seq_len, self.dim, dtype=self.inv_freq.dtype, device=self.inv_freq.device + ) + + +@torch.jit.script +def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor: + # x: [b, np, sq, hn] + b, np, sq, hn = x.size(0), x.size(1), x.size(2), x.size(3) + rot_dim = rope_cache.shape[-2] * 2 + x, x_pass = x[..., :rot_dim], x[..., rot_dim:] + # truncate to support variable sizes + rope_cache = rope_cache[:, :sq] + xshaped = x.reshape(b, np, sq, rot_dim // 2, 2) + rope_cache = rope_cache.view(-1, 1, sq, xshaped.size(3), 2) + x_out2 = torch.stack( + [ + xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1], + xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1], + ], + -1, + ) + x_out2 = x_out2.flatten(3) + return torch.cat((x_out2, x_pass), dim=-1) + + +class RMSNorm(torch.nn.Module): + def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs): + super().__init__() + self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype)) + self.eps = eps + + def forward(self, hidden_states: torch.Tensor): + input_dtype = hidden_states.dtype + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.eps) + + return (self.weight * hidden_states).to(input_dtype) + + +class CoreAttention(torch.nn.Module): + def __init__(self, config: ChatGLMConfig, layer_number): + super(CoreAttention, self).__init__() + + self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32 + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + self.layer_number = max(1, layer_number) + + projection_size 
= config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. + self.hidden_size_per_partition = projection_size + self.hidden_size_per_attention_head = projection_size // config.num_attention_heads + self.num_attention_heads_per_partition = config.num_attention_heads + + coeff = None + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + if self.apply_query_key_layer_scaling: + coeff = self.layer_number + self.norm_factor *= coeff + self.coeff = coeff + + self.attention_dropout = torch.nn.Dropout(config.attention_dropout) + + def raw_atten(self, query_layer, key_layer, value_layer, attention_mask): + attn_weights = torch.matmul(query_layer, key_layer.transpose(-1, -2)) / self.norm_factor + if attention_mask is None: + seq_len = query_layer.shape[2] + attention_mask = ~torch.tril(torch.ones([1, 1, seq_len, seq_len], device=attn_weights.device).bool()) + attn_weights = attn_weights.masked_fill(attention_mask, float("-inf")) + #mask_value = torch.finfo(attn_weights.dtype).min + #attn_weights = torch.where(attention_mask, attn_weights.to(attn_weights.dtype), mask_value) + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + context_layer = torch.matmul(attn_weights, value_layer) + return context_layer + context_layer = context_layer.transpose(1, 2).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) + context_layer = context_layer.reshape(*new_context_layer_shape) + return context_layer + + def forward(self, query_layer, key_layer, value_layer, attention_mask): + pytorch_major_version = int(torch.__version__.split('.')[0]) + if pytorch_major_version >= 2 and False: + if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]: + context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, + is_causal=True) + else: + if attention_mask is not None: + attention_mask = ~attention_mask + context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, + attention_mask) + context_layer = context_layer.transpose(1, 2).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) + context_layer = context_layer.reshape(*new_context_layer_shape) + else: + # Raw attention scores + + # [b, np, sq, sk] + output_size = (query_layer.size(0), query_layer.size(1), query_layer.size(2), key_layer.size(2)) + + # [b, np, sq, hn] -> [b * np, sq, hn] + query_layer = query_layer.view(output_size[0] * output_size[1], output_size[2], -1) + # [b, np, sk, hn] -> [b * np, sk, hn] + key_layer = key_layer.view(output_size[0] * output_size[1], output_size[3], -1) + + # preallocting input tensor: [b * np, sq, sk] + matmul_input_buffer = torch.empty( + output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype, + device=query_layer.device + ) + + # Raw attention scores. 
[b * np, sq, sk] + matmul_result = torch.baddbmm( + matmul_input_buffer, + query_layer, # [b * np, sq, hn] + key_layer.transpose(1, 2), # [b * np, hn, sk] + beta=0.0, + alpha=(1.0 / self.norm_factor), + ) + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(*output_size) + + # =========================== + # Attention probs and dropout + # =========================== + + # attention scores and attention mask [b, np, sq, sk] + if self.attention_softmax_in_fp32: + attention_scores = attention_scores.float() + if self.coeff is not None: + attention_scores = attention_scores * self.coeff + if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]: + attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3], + device=attention_scores.device, dtype=torch.bool) + attention_mask.tril_() + attention_mask = ~attention_mask + + if attention_mask is not None: + attention_scores = attention_scores.masked_fill(attention_mask, float("-inf")) + attention_probs = F.softmax(attention_scores, dim=-1) + attention_probs = attention_probs.type_as(value_layer) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.attention_dropout(attention_probs) + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value_layer -> context layer. + # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3)) + # change view [b * np, sk, hn] + #value_layer = value_layer.view(output_size[0] * output_size[1], value_layer.size(2), -1) + # change view [b * np, sq, sk] + #attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) + # matmul: [b * np, sq, hn] + # context_layer = torch.bmm(attention_probs, value_layer) + context_layer = torch.matmul(attention_probs, value_layer) + # change view [b, np, sq, hn] + # context_layer = context_layer.view(*output_size) + # [b, np, sq, hn] --> [b, sq, np, hn] + context_layer = context_layer.transpose(1, 2).contiguous() + # [b, sq, np, hn] --> [b, sq, hp] + new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) + context_layer = context_layer.reshape(*new_context_layer_shape) + + return context_layer + + +class SelfAttention(torch.nn.Module): + """Parallel self-attention layer abstract class. + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def __init__(self, config: ChatGLMConfig, layer_number, device=None): + super(SelfAttention, self).__init__() + self.layer_number = max(1, layer_number) + + self.projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. 
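+        # With a single model-parallel partition, these per-partition values are simply the
+        # full per-head dimension and the full attention-head count.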
+ self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads + self.num_attention_heads_per_partition = config.num_attention_heads + + self.multi_query_attention = config.multi_query_attention + self.qkv_hidden_size = 3 * self.projection_size + if self.multi_query_attention: + self.num_multi_query_groups_per_partition = config.multi_query_group_num + self.qkv_hidden_size = ( + self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num + ) + self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size, + bias=config.add_bias_linear or config.add_qkv_bias, + device=device, **_config_to_kwargs(config) + ) + + self.core_attention = CoreAttention(config, self.layer_number) + + # Output. + self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear, + device=device, **_config_to_kwargs(config) + ) + + def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None): + if self.multi_query_attention: + num_attention_heads = self.num_multi_query_groups_per_partition + else: + num_attention_heads = self.num_attention_heads_per_partition + return torch.empty( + inference_max_sequence_len, + batch_size, + num_attention_heads, + self.hidden_size_per_attention_head, + dtype=dtype, + device=device, + ) + + def forward(self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True): + # hidden_states: [b, sq, h] + + # ================================================= + # Pre-allocate memory for key-values for inference. + # ================================================= + # ===================== + # Query, Key, and Value + # ===================== + + # Attention heads [b, sq, h] --> [b, sq, (np * 3 * hn)] + mixed_x_layer = self.query_key_value(hidden_states) + + if self.multi_query_attention: + (query_layer, key_layer, value_layer) = mixed_x_layer.split( + [ + self.num_attention_heads_per_partition * self.hidden_size_per_attention_head, + self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, + self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, + ], + dim=-1, + ) + query_layer = query_layer.view( + query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) + ) + key_layer = key_layer.view( + key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) + ) + value_layer = value_layer.view( + value_layer.size()[:-1] + + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) + ) + else: + new_tensor_shape = mixed_x_layer.size()[:-1] + \ + (self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + # [b, sq, np, 3 * hn] --> 3 [b, sq, np, hn] + (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) + + # [b, sq, np, hn] -> [b, np, sq, hn] + query_layer, key_layer, value_layer = [k.transpose(1, 2) for k in [query_layer, key_layer, value_layer]] + + # apply relative positional encoding (rotary embedding) + if rotary_pos_emb is not None: + query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb) + key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb) + + # adjust key and value for inference + if kv_cache is not None: + cache_k, cache_v = kv_cache + key_layer = torch.cat((cache_k, key_layer), dim=2) + value_layer = torch.cat((cache_v, value_layer), dim=2) + if 
use_cache: + ''' + if kv_cache is None: + kv_cache = torch.cat((key_layer.unsqueeze(0).unsqueeze(0), value_layer.unsqueeze(0).unsqueeze(0)), dim=1) + else: + kv_cache = (key_layer, value_layer) + ''' + kv_cache = torch.stack([key_layer, value_layer], axis=0) + # ''' + else: + kv_cache = None + + if self.multi_query_attention: + key_layer = key_layer.unsqueeze(2) + key_layer = key_layer.expand( + -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1, -1 + ) + key_layer = key_layer.contiguous().view( + key_layer.size()[:1] + (self.num_attention_heads_per_partition,) + key_layer.size()[3:] + ) + value_layer = value_layer.unsqueeze(2) + value_layer = value_layer.expand( + -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1, -1 + ) + value_layer = value_layer.contiguous().view( + value_layer.size()[:1] + (self.num_attention_heads_per_partition,) + value_layer.size()[3:] + ) + + # ================================== + # core attention computation + # ================================== + + context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask) + + # ================= + # Output. [sq, b, h] + # ================= + + output = self.dense(context_layer) + + return output, kv_cache + + +def _config_to_kwargs(args): + common_kwargs = { + "dtype": args.torch_dtype, + } + return common_kwargs + + +class MLP(torch.nn.Module): + """MLP. + + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform nonlinear transformation, and project the + state back into h hidden dimension. + """ + + def __init__(self, config: ChatGLMConfig, device=None): + super(MLP, self).__init__() + + self.add_bias = config.add_bias_linear + + # Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf + self.dense_h_to_4h = nn.Linear( + config.hidden_size, + config.ffn_hidden_size * 2, + bias=self.add_bias, + device=device, + **_config_to_kwargs(config) + ) + + def swiglu(x): + x = torch.chunk(x, 2, dim=-1) + return F.silu(x[0]) * x[1] + + self.activation_func = swiglu + + # Project back to h. + self.dense_4h_to_h = nn.Linear( + config.ffn_hidden_size, + config.hidden_size, + bias=self.add_bias, + device=device, + **_config_to_kwargs(config) + ) + + def forward(self, hidden_states): + # [s, b, 4hp] + intermediate_parallel = self.dense_h_to_4h(hidden_states) + intermediate_parallel = self.activation_func(intermediate_parallel) + # [s, b, h] + output = self.dense_4h_to_h(intermediate_parallel) + return output + + +class GLMBlock(torch.nn.Module): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + """ + + def __init__(self, config: ChatGLMConfig, layer_number, device=None): + super(GLMBlock, self).__init__() + self.layer_number = layer_number + + self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm + + self.fp32_residual_connection = config.fp32_residual_connection + + LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm + # Layernorm on the input data. + self.input_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + # Self attention. 
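+        # layer_number is 1-based; CoreAttention uses it as the scaling coefficient when
+        # apply_query_key_layer_scaling is enabled.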
+ self.self_attention = SelfAttention(config, layer_number, device=device) + self.hidden_dropout = config.hidden_dropout + + # Layernorm on the attention output + self.post_attention_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + # MLP + self.mlp = MLP(config, device=device) + + def forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True, + ): + # hidden_states: [s, b, h] + hidden_states = hidden_states.view(1, -1, 4096) + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + # Self attention. + attention_output, kv_cache = self.self_attention( + layernorm_output, + attention_mask, + rotary_pos_emb, + kv_cache=kv_cache, + use_cache=use_cache + ) + + # Residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training) + layernorm_input = residual + layernorm_input + + # Layer norm post the self attention. + layernorm_output = self.post_attention_layernorm(layernorm_input) + + # MLP. + mlp_output = self.mlp(layernorm_output) + + # Second residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = layernorm_input + + output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training) + output = residual + output + + return output, kv_cache + + +class GLMTransformer(torch.nn.Module): + """Transformer class.""" + + def __init__(self, config: ChatGLMConfig, device=None): + super(GLMTransformer, self).__init__() + + self.fp32_residual_connection = config.fp32_residual_connection + self.post_layer_norm = config.post_layer_norm + + # Number of layers. + self.num_layers = config.num_layers + + # Transformer layers. + def build_layer(layer_number): + return GLMBlock(config, layer_number, device=device) + + self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)]) + + if self.post_layer_norm: + LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm + # Final layer norm before output. + self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, + dtype=config.torch_dtype) + + self.gradient_checkpointing = False + + def _get_layer(self, layer_number): + return self.layers[layer_number] + + def forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None, + use_cache: Optional[bool] = True, + output_hidden_states: Optional[bool] = False, + ): + if not kv_caches: + kv_caches = [None for _ in range(self.num_layers)] + presents = () if use_cache else None + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + all_self_attentions = None + all_hidden_states = () if output_hidden_states else None + for index in range(self.num_layers): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer = self._get_layer(index) + if self.gradient_checkpointing and self.training: + layer_ret = torch.utils.checkpoint.checkpoint( + layer, + hidden_states, + attention_mask, + rotary_pos_emb, + kv_caches[index], + use_cache, + use_reentrant=False + ) + else: + layer_ret = layer( + hidden_states, + attention_mask, + rotary_pos_emb, + kv_cache=kv_caches[index], + use_cache=use_cache + ) + hidden_states, kv_cache = layer_ret + if use_cache: + # token by token decoding, use tuple format + if kv_caches[0] is not None: + presents = presents + (kv_cache,) + # prefilling in decoding, use tensor format to save cuda memory + else: + if len(presents) == 0: + presents = kv_cache + else: + presents = torch.cat((presents, kv_cache), dim=0) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # Final layer norm. + if self.post_layer_norm: + hidden_states = self.final_layernorm(hidden_states) + + return hidden_states, presents, all_hidden_states, all_self_attentions + + +class ChatGLMPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and + a simple interface for downloading and loading pretrained models. + """ + + is_parallelizable = False + supports_gradient_checkpointing = True + config_class = ChatGLMConfig + base_model_prefix = "transformer" + _no_split_modules = ["GLMBlock"] + + def _init_weights(self, module: nn.Module): + """Initialize the weights.""" + return + + def get_masks(self, input_ids, past_key_values, padding_mask=None): + batch_size, seq_length = input_ids.shape + full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device) + full_attention_mask.tril_() + past_length = 0 + if past_key_values: + past_length = past_key_values[0][0].shape[2] + if past_length: + full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length, + device=input_ids.device), full_attention_mask), dim=-1) + if padding_mask is not None: + full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1) + if not past_length and padding_mask is not None: + full_attention_mask -= padding_mask.unsqueeze(-1) - 1 + full_attention_mask = (full_attention_mask < 0.5).bool() + full_attention_mask.unsqueeze_(1) + return full_attention_mask + + def get_position_ids(self, input_ids, device): + batch_size, seq_length = input_ids.shape + position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1) + return position_ids + + def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None): + if not self.supports_gradient_checkpointing: + raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") + + +class Embedding(torch.nn.Module): + """Language model embeddings.""" + + def __init__(self, config: ChatGLMConfig, device=None): + super(Embedding, self).__init__() + + self.hidden_size = config.hidden_size + # Word embeddings (parallel). + self.word_embeddings = nn.Embedding( + config.padded_vocab_size, + self.hidden_size, + dtype=config.torch_dtype, + device=device + ) + self.fp32_residual_connection = config.fp32_residual_connection + + def forward(self, input_ids): + # Embeddings. 
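+        # Token-id lookup: [b, sq] -> [b, sq, h]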
+ words_embeddings = self.word_embeddings(input_ids) + embeddings = words_embeddings + # If the input flag for fp32 residual connection is set, convert for float. + if self.fp32_residual_connection: + embeddings = embeddings.float() + return embeddings + + +class ChatGLMModel(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, device=None, empty_init=True): + super().__init__(config) + if empty_init: + init_method = skip_init + else: + init_method = default_init + init_kwargs = {} + if device is not None: + init_kwargs["device"] = device + self.embedding = init_method(Embedding, config, **init_kwargs) + self.num_layers = config.num_layers + self.multi_query_group_num = config.multi_query_group_num + self.kv_channels = config.kv_channels + + # Rotary positional embeddings + self.seq_length = config.seq_length + rotary_dim = ( + config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels + ) + + self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, rope_ratio=config.rope_ratio, original_impl=config.original_rope, + device=device, dtype=config.torch_dtype) + self.encoder = init_method(GLMTransformer, config, **init_kwargs) + self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False, + dtype=config.torch_dtype, **init_kwargs) + + def get_input_embeddings(self): + return self.embedding.word_embeddings + + def set_input_embeddings(self, value): + self.embedding.word_embeddings = value + + def forward( + self, + input_ids, + position_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.BoolTensor] = None, + full_attention_mask: Optional[torch.BoolTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, seq_length = input_ids.shape + + if inputs_embeds is None: + inputs_embeds = self.embedding(input_ids) + + if full_attention_mask is None: + if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1): + full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask) + + # Rotary positional embeddings + rotary_pos_emb = self.rotary_pos_emb(self.seq_length) + if position_ids is not None: + rotary_pos_emb = rotary_pos_emb[position_ids] + else: + rotary_pos_emb = rotary_pos_emb[None, :seq_length] + + # Run encoder. 
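+        # During prefill the encoder returns the per-layer KV caches stacked into a single
+        # tensor; they are split back into per-layer (key, value) tuples just below.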
+ hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder( + inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb, + kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states + ) + if presents is not None and type(presents) is torch.Tensor: + presents = presents.split(1, dim=0) + presents = list(presents) + presents = [list(x.squeeze(0).split(1, dim=0)) for x in presents] + presents = [tuple([x.squeeze(0) for x in y]) for y in presents] + presents = tuple(presents) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): + super().__init__(config) + + self.max_sequence_length = config.max_length + self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) + self.config = config + + def _update_model_kwargs_for_generation( + self, + outputs: ModelOutput, + model_kwargs: Dict[str, Any], + is_encoder_decoder: bool = False, + standardize_cache_format: bool = False, + ) -> Dict[str, Any]: + # update past_key_values + model_kwargs["past_key_values"] = self._extract_past_from_model_output( + outputs, standardize_cache_format=standardize_cache_format + ) + + # update attention mask + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = torch.cat( + [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 + ) + + # update position ids + if "position_ids" in model_kwargs: + position_ids = model_kwargs["position_ids"] + new_position_id = position_ids[..., -1:].clone() + new_position_id += 1 + model_kwargs["position_ids"] = torch.cat( + [position_ids, new_position_id], dim=-1 + ) + + model_kwargs["is_first_forward"] = False + return model_kwargs + + def prepare_inputs_for_generation( + self, + input_ids: torch.LongTensor, + past_key_values: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + is_first_forward: bool = True, + **kwargs + ) -> dict: + # only last token for input_ids if past is not None + if position_ids is None: + position_ids = self.get_position_ids(input_ids, device=input_ids.device) + if not is_first_forward: + if past_key_values is not None: + position_ids = position_ids[..., -1:] + input_ids = input_ids[:, -1:] + return { + "input_ids": input_ids, + "past_key_values": past_key_values, + "position_ids": position_ids, + "attention_mask": attention_mask, + "return_last_logit": True, + "use_cache": use_cache + } + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + return_last_logit: Optional[bool] = False, + ): + use_cache = use_cache if use_cache is not None else self.config.use_cache + 
return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + if return_last_logit: + hidden_states = hidden_states[:, -1:] + lm_logits = self.transformer.output_layer(hidden_states) + + loss = None + if labels is not None: + lm_logits = lm_logits.to(torch.float32) + + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss(ignore_index=-100) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + lm_logits = lm_logits.to(hidden_states.dtype) + loss = loss.to(hidden_states.dtype) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + @staticmethod + def _reorder_cache( + past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor + ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]: + """ + This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or + [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct + beam_idx at every generation step. + + Output shares the same memory storage as `past`. 
+ """ + return tuple( + ( + layer_past[0].index_select(0, beam_idx.to(layer_past[0].device)), + layer_past[1].index_select(0, beam_idx.to(layer_past[1].device)), + ) + for layer_past in past + ) + + def process_response(self, output, history): + content = "" + history = deepcopy(history) + for response in output.split("<|assistant|>"): + if "\n" in response: + metadata, content = response.split("\n", maxsplit=1) + else: + metadata, content = "", response + if not metadata.strip(): + content = content.strip() + history.append({"role": "assistant", "metadata": metadata, "content": content}) + content = content.replace("[[训练时间]]", "2023年") + else: + history.append({"role": "assistant", "metadata": metadata, "content": content}) + if history[0]["role"] == "system" and "tools" in history[0]: + parameters = json.loads(content) + content = {"name": metadata.strip(), "parameters": parameters} + else: + content = {"name": metadata.strip(), "content": content} + return content, history + + @torch.inference_mode() + def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", + max_length: int = 8192, num_beams=1, do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None, + **kwargs): + if history is None: + history = [] + if logits_processor is None: + logits_processor = LogitsProcessorList() + logits_processor.append(InvalidScoreLogitsProcessor()) + gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, + "temperature": temperature, "logits_processor": logits_processor, **kwargs} + history.append({"role": role, "content": query}) + inputs = tokenizer.apply_chat_template(history, add_generation_prompt=True, tokenize=True, + return_tensors="pt", return_dict=True) + inputs = inputs.to(self.device) + eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|user|>"), + tokenizer.convert_tokens_to_ids("<|observation|>")] + outputs = self.generate(**inputs, **gen_kwargs, eos_token_id=eos_token_id) + outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] + response = tokenizer.decode(outputs) + response, history = self.process_response(response, history) + return response, history + + @torch.inference_mode() + def stream_chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", + past_key_values=None, max_length: int = 8192, do_sample=True, top_p=0.8, temperature=0.8, + logits_processor=None, return_past_key_values=False, **kwargs): + if history is None: + history = [] + if logits_processor is None: + logits_processor = LogitsProcessorList() + logits_processor.append(InvalidScoreLogitsProcessor()) + eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|user|>"), + tokenizer.convert_tokens_to_ids("<|observation|>")] + gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p, + "temperature": temperature, "logits_processor": logits_processor, **kwargs} + if past_key_values is None: + inputs = tokenizer.apply_chat_template(history + [{"role": role, "content": query}], + add_generation_prompt=True, tokenize=True, return_tensors="pt", + return_dict=True) + else: + inputs = tokenizer.apply_chat_template([{"role": role, "content": query}], add_special_tokens=False, + add_generation_prompt=True, tokenize=True, return_tensors="pt", + return_dict=True) + inputs = inputs.to(self.device) + if past_key_values is not None: + past_length = past_key_values[0][0].shape[2] + inputs.position_ids += past_length + attention_mask = inputs.attention_mask + 
attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1) + inputs['attention_mask'] = attention_mask + history.append({"role": role, "content": query}) + for outputs in self.stream_generate(**inputs, past_key_values=past_key_values, + eos_token_id=eos_token_id, return_past_key_values=return_past_key_values, + **gen_kwargs): + if return_past_key_values: + outputs, past_key_values = outputs + outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] + response = tokenizer.decode(outputs) + if response and response[-1] != "�": + response, new_history = self.process_response(response, history) + if return_past_key_values: + yield response, new_history, past_key_values + else: + yield response, new_history + + @torch.inference_mode() + def stream_generate( + self, + input_ids, + generation_config: Optional[GenerationConfig] = None, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, + return_past_key_values=False, + **kwargs, + ): + batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1] + + if generation_config is None: + generation_config = self.generation_config + generation_config = copy.deepcopy(generation_config) + model_kwargs = generation_config.update(**kwargs) + model_kwargs["use_cache"] = generation_config.use_cache + bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id + + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None + + has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None + if has_default_max_length and generation_config.max_new_tokens is None: + warnings.warn( + f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. " + "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we" + " recommend using `max_new_tokens` to control the maximum length of the generation.", + UserWarning, + ) + elif generation_config.max_new_tokens is not None: + generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length + if not has_default_max_length: + logger.warn( + f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(=" + f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. " + "Please refer to the documentation for more information. " + "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)", + UserWarning, + ) + + if input_ids_seq_length >= generation_config.max_length: + input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" + logger.warning( + f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to" + f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider" + " increasing `max_new_tokens`." + ) + + # 2. 
Set generation parameters if not already defined + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + + logits_processor = self._get_logits_processor( + generation_config=generation_config, + input_ids_seq_length=input_ids_seq_length, + encoder_input_ids=input_ids, + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, + logits_processor=logits_processor, + ) + + stopping_criteria = self._get_stopping_criteria( + generation_config=generation_config, stopping_criteria=stopping_criteria + ) + logits_warper = self._get_logits_warper(generation_config) + + unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) + scores = None + while True: + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + # forward pass to get next token + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=False, + output_hidden_states=False, + ) + + next_token_logits = outputs.logits[:, -1, :] + + # pre-process distribution + next_token_scores = logits_processor(input_ids, next_token_logits) + next_token_scores = logits_warper(input_ids, next_token_scores) + + # sample + probs = nn.functional.softmax(next_token_scores, dim=-1) + if generation_config.do_sample: + next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) + else: + next_tokens = torch.argmax(probs, dim=-1) + # update generated ids, model inputs, and length for next step + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + unfinished_sequences = unfinished_sequences.mul( + next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) + ) + if return_past_key_values: + yield input_ids, outputs.past_key_values + else: + yield input_ids + # stop when each sentence is finished, or if we exceed the maximum length + if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): + break + + +class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel): + def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): + super().__init__(config) + + self.num_labels = config.num_labels + self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) + + self.classifier_head = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=torch.half) + if config.classifier_dropout is not None: + self.dropout = nn.Dropout(config.classifier_dropout) + else: + self.dropout = None + self.config = config + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + full_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + inputs_embeds: Optional[torch.LongTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutputWithPast]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + 
full_attention_mask=full_attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + pooled_hidden_states = hidden_states[-1] + if self.dropout is not None: + pooled_hidden_states = self.dropout(pooled_hidden_states) + logits = self.classifier_head(pooled_hidden_states) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze().float(), labels.squeeze()) + else: + loss = loss_fct(logits.float(), labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels).float(), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits.float(), labels.view(-1, self.num_labels)) + + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/transformers/llm/export/llm_models/internlm-chat-7b/modeling_internlm.py b/transformers/llm/export/llm_models/internlm-chat-7b/modeling_internlm.py index af9f5842c..b636e8716 100755 --- a/transformers/llm/export/llm_models/internlm-chat-7b/modeling_internlm.py +++ b/transformers/llm/export/llm_models/internlm-chat-7b/modeling_internlm.py @@ -147,8 +147,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids): # sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] cos = torch.squeeze(cos) # [seq_len, dim] sin = torch.squeeze(sin) # [seq_len, dim] - cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + # cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + # sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed @@ -202,11 +202,12 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, output_attentions: bool = False, use_cache: bool = False, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() - + ''' query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) @@ -214,7 +215,10 @@ def forward( kv_seq_len = key_states.shape[-2] if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] - cos, sin = 
self.rotary_emb(value_states, seq_len=kv_seq_len) + if rotary_pos_emb is None: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + else: + cos, sin = rotary_pos_emb query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) # [bsz, nh, t, hd] @@ -226,7 +230,30 @@ def forward( past_key_value = (key_states, value_states) if use_cache else None attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + ''' + #--------------- + query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim) + key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim) + value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim) + kv_seq_len = key_states.shape[1] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[1] + # rope + cos, sin = rotary_pos_emb + query_states = (query_states * cos) + (rotate_half(query_states) * sin) + key_states = (key_states * cos) + (rotate_half(key_states) * sin) + # kv cache + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=1) + value_states = torch.cat((past_value, value_states), dim=1) + past_key_value = torch.stack((key_states, value_states)) + query_states = query_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) + value_states = value_states.transpose(1, 2) + attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) + #--------------- if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" @@ -239,7 +266,7 @@ def forward( f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" ) attn_weights = attn_weights + attention_mask - attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)) + # attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)) # upcast attention to fp32 attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) @@ -281,6 +308,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: @@ -308,6 +336,7 @@ def forward( attention_mask=attention_mask, position_ids=position_ids, past_key_value=past_key_value, + rotary_pos_emb=rotary_pos_emb, output_attentions=output_attentions, use_cache=use_cache, )
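
The rewritten InternLM attention above keeps query/key/value in a [batch, seq_len, num_heads, head_dim] layout, applies the externally supplied rotary_pos_emb (cos/sin) before any transpose, concatenates the KV cache along the sequence dimension, and only permutes tensors for the final score matmul. The standalone sketch below (not part of the patch; toy sizes and a naive rotary table are assumed purely for illustration) checks that this permuted layout produces the same attention scores as the conventional transpose-first code it replaces:

    # Standalone equivalence check (illustrative only; not part of the model code).
    import math
    import torch

    def rotate_half(x):
        x1, x2 = x.chunk(2, dim=-1)
        return torch.cat((-x2, x1), dim=-1)

    bsz, seq_len, num_heads, head_dim = 1, 4, 2, 8           # toy sizes, chosen arbitrarily
    q = torch.randn(bsz, seq_len, num_heads, head_dim)
    k = torch.randn(bsz, seq_len, num_heads, head_dim)

    # Naive rotary table, broadcastable to [bsz, seq_len, num_heads, head_dim].
    inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim))
    freqs = torch.outer(torch.arange(seq_len).float(), inv_freq)
    emb = torch.cat((freqs, freqs), dim=-1)                   # [seq_len, head_dim]
    cos, sin = emb.cos()[:, None, :], emb.sin()[:, None, :]   # [seq_len, 1, head_dim]

    # RoPE applied in the [bsz, seq, heads, head_dim] layout, as in the rewritten forward.
    q = (q * cos) + (rotate_half(q) * sin)
    k = (k * cos) + (rotate_half(k) * sin)

    # Conventional layout: transpose to [bsz, heads, seq, head_dim], then transpose k again.
    ref = torch.matmul(q.transpose(1, 2), k.transpose(1, 2).transpose(2, 3)) / math.sqrt(head_dim)

    # Rewritten layout: permute k directly to [bsz, heads, head_dim, seq] for the matmul.
    new = torch.matmul(q.transpose(1, 2), k.permute([0, 2, 3, 1])) / math.sqrt(head_dim)

    assert torch.allclose(ref, new, atol=1e-6)                # identical attention scores

Deferring the transposes and feeding cos/sin in from outside keeps the exported graph in one regular pattern, presumably so that a graph-level attention-fusion pass can match it during conversion; the sketch only verifies numerical equivalence, not the fusion itself.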