[dist]add benchmark scripts check #2633

Open · wants to merge 1 commit into base: develop
162 changes: 162 additions & 0 deletions distributed/benchmark_scripts/check.sh
@@ -0,0 +1,162 @@
#!/usr/bin/env bash

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
####################################
export dir_name=$1 # /path/to/demo-model_name
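# Example invocation (using the gpt_auto model directory added in this PR):
#   bash distributed/benchmark_scripts/check.sh ./distributed/benchmark_scripts/gpt_auto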

# Check whether shellcheck is installed
echo "==============shellcheck=============="
if command -v shellcheck &> /dev/null; then
echo "shellcheck is installed"
else
echo "shellcheck is not installed, install it now"
echo "apt install shellcheck or yum install shellcheck, exit 1"
exit 1
fi

check_case_name(){
file=$1
echo "=========校验文件名称格式========="
file_temp=${file##*/}
file_name=${file_temp%.sh}
file_model_item=${file_name%%_bs*}
file_global_batch_size=$(echo "$file_name" | grep -oP '_bs\K\d+')
file_fp_item=$(echo "$file_name" | grep -oP 'bf[^_]*|fp[^_]*')
file_run_mode=$(echo "$file_name" | sed 's/.*_//')

model_item=$(grep -oP 'model_item=\K[^"]*' "$file" | sed 's/ *$//')
global_batch_size=$(grep -oP 'global_batch_size=\K\d+' "$file" | sed 's/ *$//')
fp_item=$(grep -oP 'fp_item=\K[^"]*' "$file" | sed 's/ *$//')
run_mode=$(grep -oP 'run_mode=\K[^"]*' "$file" | sed 's/ *$//')
model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode}
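# Example (values taken from generate/params.yaml in this PR): a case file named
#   gpt_auto_bs16_fp16O2_DP2-MP2-PP2-SD2-stage3.sh
# must set model_item=gpt_auto, global_batch_size=16, fp_item=fp16O2 and
# run_mode=DP2-MP2-PP2-SD2-stage3 so that the reconstructed model_name matches the file name.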

if [[ $file_name != "$model_name" ]]; then
echo "Abnormal exit: the file name does not match the concatenated model_name. file name: $file_name, concatenated model_name: $model_name"
exit 1
fi
if [[ $file_model_item != "$model_item" ]]; then
echo "Abnormal exit: model_item mismatch. model_item from file name: $file_model_item, model_item inside the file: $model_item"
exit 1
fi
if [[ $file_global_batch_size != "$global_batch_size" ]]; then
echo "Abnormal exit: global_batch_size mismatch. global_batch_size from file name: $file_global_batch_size, global_batch_size inside the file: $global_batch_size"
exit 1
fi
# Skip this check when fp_item is empty
if [[ $file_fp_item != "$fp_item" ]] && [[ -n "$fp_item" ]]; then
echo "Abnormal exit: fp_item mismatch. fp_item from file name: $file_fp_item, fp_item inside the file: $fp_item"
exit 1
fi
if [[ $file_run_mode != "$run_mode" ]]; then
echo "Abnormal exit: run_mode mismatch. run_mode from file name: $file_run_mode, run_mode inside the file: $run_mode"
exit 1
fi
echo "=========Case file name format check passed========="
}
check_param_mode(){
file=$1
file_content=$(cat "$file")
if [[ $file_content == *'param+='* ]]; then
echo "匹配param模式校验"
if grep -qE '^param="[^"]+' "$file"; then
echo "param=存在"
else
echo "异常退出,文件内容不包含'param='"
exit 1
fi
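# Expected shape of the param lines (taken from the gpt_auto case script in this PR):
#   param="model_item=gpt_auto "
#   param+="global_batch_size=16 "
# i.e. each value keeps a trailing space before the closing double quote, so that
# concatenating param yields space-separated key=value pairs for run_benchmark.sh.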
# Iterate over every line of the file and confirm that lines containing param end with a space
while IFS= read -r line; do
# Check whether the line contains param+= or param=
if [[ "$line" == *"param+="* ]] || [[ "$line" == *"param="* ]]; then
# Check whether the line contains a space and a double quote
if [[ "$line" == *" "* ]] && [[ "$line" == *'"'* ]]; then
continue
else
echo "Abnormal exit: a param=|param+= line does not end with a space and a double quote: $line"
exit 1
fi
fi
done < "$file"
echo "All param=|param+= lines end with a space and a double quote, as expected"
fi
}
check_run_mode(){
file=$1
run_mode=$(grep -oP 'run_mode=\K[^"]*' "$file" | sed 's/ *$//')
run_benchmark=$(dirname "$file")/../benchmark_common/run_benchmark.sh
if grep -qE "{run_mode} in" "$run_benchmark"; then
echo "匹配run_mode模式校验"
if grep -qE "$run_mode" "$run_benchmark"; then
echo "run_mode匹配成功"
else
echo "异常退出,文件内容不包含$run_mode"
exit 1
fi
fi
}
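# Note: the "{run_mode} in" grep above looks for a dispatch on run_mode inside
# run_benchmark.sh, roughly of the form (a sketch; run_benchmark.sh itself is not
# part of this diff):
#   case ${run_mode} in
#     DP1-MP1-PP8-SD1-stage1) ... ;;
#   esac
# so the run_mode declared by a case script must appear as one of those branches.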

check_args(){
file=$1
if ! grep -qP '^model_item=[^/]*$' "$file"; then
echo "Abnormal exit: the file does not contain model_item, or model_item contains a / character"
exit 1
fi
if ! grep -qP '^global_batch_size=' "$file"; then
echo "Abnormal exit: the file does not contain global_batch_size"
exit 1
fi
if ! grep -qP '^fp_item=' "$file"; then
echo "Abnormal exit: the file does not contain fp_item"
exit 1
fi
if ! grep -qP '^run_mode=' "$file"; then
echo "Abnormal exit: the file does not contain run_mode"
exit 1
fi
model_name_str=$(grep -oP 'model_item=\K[^"]*' "$file" | sed 's/ *$//')
}

# Recursive function used to traverse a directory tree
traverse_folder() {
local folder=$1

# Iterate over all files and subdirectories in the current directory
for item in "$folder"/*; do
# Check whether the entry is a directory
if [[ -d "$item" ]]; then
traverse_folder "$item" # Recurse into the subdirectory
else
echo "File: ${item}"
# Run shellcheck and keep only error-level findings
shellcheck --format=gcc "${item}" | grep -v '^$' | grep -v '^#' | grep error
if [[ ${item} == *"N"*"C"*".sh" ]]; then
# Check the case file name format
check_case_name "${item}"
# Check the param mode
check_param_mode "${item}"
# Check that run_mode matches run_benchmark.sh
check_run_mode "${item}"
elif [[ ${item} == *"run_benchmark.sh" ]]; then
check_args "${item}"
else
echo "Other scripts need a manual check"
fi

fi
done
}

# Start the traversal by calling the recursive function
traverse_folder "$dir_name"
117 changes: 117 additions & 0 deletions distributed/benchmark_scripts/generate/generate_sh.py
@@ -0,0 +1,117 @@
import yaml
import os
import sys
# After benchmark_common and one case script have been written, adapt generate_sh.py and update params.yaml according to the case configuration
# Run from the root directory of the model suite: cd /path/to/repo && python /path/to/generate_sh.py paddle /path/to/params.yaml
def generate_paddle_case(arg_params):
    test_case_template = """
#!/usr/bin/env bash
param="model_item={model_item_temp} "
param+="global_batch_size={global_batch_size_temp} "
param+="fp_item={fp_item_temp} "
param+="run_mode={run_mode_temp} "
param+="device_num={device_num_temp} "
param+="micro_batch_size={micro_batch_size_temp} "
# Optional parameters for run_benchmark.sh below
param+="dp_degree={dp_degree_temp} "
param+="mp_degree={mp_degree_temp} "
param+="pp_degree={pp_degree_temp} "
param+="sharding_degree={sharding_degree_temp} "
param+="sharding_stage={sharding_stage_temp} "
param+="level={level_temp} "
param+="local_batch_size={local_batch_size_temp} "
param+="workerlog_id={workerlog_id_temp} "

cd ./benchmarks
# get data
bash {model_item_script_path_temp}/benchmark_common/prepare.sh
# run
bash -c "${{param}} bash {model_item_script_path_temp}/benchmark_common/run_benchmark.sh"
"""

    # Fill in the template with the configured parameters and generate the test case
    print(arg_params)
    model_item = arg_params['model_item']
    base_batch_size = arg_params['global_batch_size']
    fp_item = arg_params['fp_item']
    run_mode = arg_params['run_mode']
    device_num = arg_params['device_num']
    test_case = test_case_template.format(
        model_item_temp=model_item,
        global_batch_size_temp=base_batch_size,
        fp_item_temp=fp_item,
        run_mode_temp=run_mode,
        device_num_temp=device_num,
        micro_batch_size_temp=arg_params['micro_batch_size'],
        dp_degree_temp=arg_params['dp_degree'],
        mp_degree_temp=arg_params['mp_degree'],
        pp_degree_temp=arg_params['pp_degree'],
        sharding_degree_temp=arg_params['sharding_degree'],
        sharding_stage_temp=arg_params['sharding_stage'],
        level_temp=arg_params['level'],
        local_batch_size_temp=arg_params['local_batch_size'],
        workerlog_id_temp=arg_params['workerlog_id'],
        model_item_script_path_temp=arg_params['model_item_script_path'],
    )

    # Create the output directory and write the case script
    os.makedirs(os.path.join(arg_params['benchmark_path'], arg_params['model_item_script_path'], device_num), exist_ok=True)
    with open(os.path.join(arg_params['benchmark_path'], arg_params['model_item_script_path'],
                           device_num, f'{model_item}_bs{base_batch_size}_{fp_item}_{run_mode}.sh'), 'w') as f:
        f.write(test_case)
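    # With the sample generate/params.yaml in this PR, this sketch would place the case
    # script under <benchmark_path>/<model_item_script_path>/N1C8/ and name it
    # gpt_auto_bs16_fp16O2_DP2-MP2-PP2-SD2-stage3.sh (values not verified here).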



def generate_pytorch_case(arg_params):
    test_case_template = """
#!/usr/bin/env bash
model_item={model_item_temp}
bs_item={base_batch_size_temp}
fp_item={fp_item_temp}
run_mode={run_mode_temp}
device_num={device_num_temp}
max_iter={max_iter_temp}
num_workers={num_workers_temp}
# get data
bash prepare.sh
# run
bash run_benchmark.sh ${{model_item}} ${{bs_item}} ${{fp_item}} ${{run_mode}} ${{device_num}} ${{max_iter}} ${{num_workers}} 2>&1;
"""

    # Fill in the template with the configured parameters and generate the test case
    print(arg_params)
    model_item = arg_params['model_item']
    base_batch_size = arg_params['base_batch_size']
    fp_item = arg_params['fp_item']
    run_mode = arg_params['run_mode']
    device_num = arg_params['device_num']
    max_iter = arg_params['max_iter']
    num_workers = arg_params['num_workers']
    test_case = test_case_template.format(
        model_item_temp=model_item,
        base_batch_size_temp=base_batch_size,
        fp_item_temp=fp_item,
        run_mode_temp=run_mode,
        device_num_temp=device_num,
        max_iter_temp=max_iter,
        num_workers_temp=num_workers,
        model_item_script_path_temp=arg_params['model_item_script_path'],
    )
    # Create the output directory and write the case script
    os.makedirs(os.path.join(arg_params['model_item_script_path'], model_item, device_num), exist_ok=True)
    with open(os.path.join(arg_params['model_item_script_path'], model_item, device_num, f'{model_item}_bs{base_batch_size}_{fp_item}_{run_mode}.sh'), 'w') as f:
        f.write(test_case)


if __name__ == "__main__":
    # Read the command-line arguments
    frame = sys.argv[1]
    config_path = sys.argv[2]
    # Read the parameters from the YAML config file
    with open(config_path, 'r') as file:
        arg_params = yaml.safe_load(file)
    if frame == 'paddle':
        generate_paddle_case(arg_params)
    else:
        generate_pytorch_case(arg_params)

16 changes: 16 additions & 0 deletions distributed/benchmark_scripts/generate/params.yaml
@@ -0,0 +1,16 @@
model_item: gpt_auto
global_batch_size: 16
fp_item: fp16O2
run_mode: DP2-MP2-PP2-SD2-stage3
device_num: N1C8
micro_batch_size: 4
dp_degree: 2
mp_degree: 2
pp_degree: 2
sharding_degree: 2
sharding_stage: 3
level: o2
local_batch_size: 8
workerlog_id: 2
benchmark_path: ./model_zoo/gpt-3/benchmarks/ # Directory where the benchmark scripts live, usually the suite root directory; e.g. ./tests
model_item_script_path: ./test_tipc/gpt/static/auto_parallel # Directory where the model_item scripts live; e.g. ./test_tipc/dygraph/hybrid_parallelism/gpt
@@ -0,0 +1,37 @@
#!/usr/bin/env bash

# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Required parameters for run_benchmark.sh below
param="model_item=gpt_auto "
param+="global_batch_size=16 "
param+="fp_item=fp16O2 "
param+="run_mode=DP1-MP1-PP8-SD1-stage1 "
param+="device_num=N1C8 "
param+="micro_batch_size=2 "
# Optional parameters for run_benchmark.sh below
param+="dp_degree=1 "
param+="mp_degree=1 "
param+="pp_degree=8 "
param+="sharding_degree=1 "
param+="sharding_stage=1 "
param+="level=o2 "
param+="local_batch_size=16 "
param+="workerlog_id=7 "

cd ./benchmarks
bash ./test_tipc/gpt/static/auto_parallel/benchmark_common/prepare.sh
# run
bash -c "${param} bash ./test_tipc/gpt/static/auto_parallel/benchmark_common/run_benchmark.sh"
22 changes: 22 additions & 0 deletions distributed/benchmark_scripts/gpt_auto/benchmark_common/prepare.sh
@@ -0,0 +1,22 @@
#!/bin/bash
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

python -m pip install -r ../requirements.txt
# get data
cd ../ || return
rm -rf data
mkdir data
wget -O data/gpt_en_dataset_300m_ids.npy https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy
wget -O data/gpt_en_dataset_300m_idx.npz https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz