run_best_of_N.eval_mode.sh

#!/bin/bash
# Configuration.
GPU=2
batch_size=10000
cal_reward=true
path_to_gen_self_feedback=./prompts/prompt_generate_self_feedback.v3.txt
path_to_revise_w_feedback=./prompts/prompt_revise_response_w_feedback.v3.txt
path_to_revise_wo_feedback=./prompts/prompt_revise_response_wo_feedback.txt

data_source=alpaca-GPT4-dev-100
prefer_type=common  # alternatives: preference_1, preference_2

# Map the preference type to its description file once, since prefer_type is
# fixed above (this mapping was previously repeated inside each mode branch).
if [ "$prefer_type" == 'common' ]; then
    path_user_preference=./user_preference.txt
elif [ "$prefer_type" == 'preference_1' ]; then
    path_user_preference=./user_preference_1.txt
elif [ "$prefer_type" == 'preference_2' ]; then
    path_user_preference=./user_preference_2.txt
fi

if [ "$data_source" == 'alpaca-GPT4-dev-100' ]; then
    dataset_input=./data/exp_sampling/alpaca_gpt4.dev_set.num=100.w_preference_by_gpt-3.5.jsonl
fi

path_to_discriminator=./UltraRM-13b
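
# For each base model, this script runs four decoding strategies and
# (optionally) scores every response file with the UltraRM-13b reward model:
#   sample_N                - sample N responses conditioned on the user preference
#   sample_N_wo_prefer      - sample N responses without the preference
#   tree_search             - sample N/2 responses, keep the best by reward,
#                             then revise it with self-generated feedback
#   tree_search_wo_feedback - same as tree_search, but the revision step
#                             runs without self-feedback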
for path_to_base_model in vicuna-13b-v1.5 WizardLM-13B-V1.2 Mistral-7B-Instruct-v0.1 zephyr-7b-beta Xwin-LM-13B-V0.2 tulu-2-dpo-13b tulu-2-dpo-7b ; do
    # Each model writes into its own output directory.
    root_to_save=./${data_source}.prefer=${prefer_type}.${path_to_base_model}
    mkdir -p "$root_to_save"
    max_tokens=2048
    for mode in sample_N sample_N_wo_prefer tree_search tree_search_wo_feedback ; do
        for n_sample in 128 ; do
            # Later stages overwrite $input, so reset it to the dataset on
            # every iteration.
            input="$dataset_input"
            # Map each model to its local checkpoint and prompt template.
            if [ "$path_to_base_model" == 'vicuna-13b-v1.5' ]; then
                base_model=./models/vicuna-13b-v1.5
                prompt_template=./prompts/vicuna.json
            elif [ "$path_to_base_model" == 'WizardLM-13B-V1.2' ]; then
                base_model=./models/WizardLM-13B-V1.2
                prompt_template=./prompts/vicuna.json
            elif [ "$path_to_base_model" == 'Mistral-7B-Instruct-v0.2' ]; then
                base_model=./models/Mistral-7B-Instruct-v0.2
                prompt_template=./prompts/mistral-instruct.json
            elif [ "$path_to_base_model" == 'zephyr-7b-beta' ]; then
                base_model=./models/zephyr-7b-beta
                prompt_template=./prompts/zephyr.json
            elif [ "$path_to_base_model" == 'Xwin-LM-13B-V0.2' ]; then
                base_model=./models/Xwin-LM-13B-V0.2
                prompt_template=./prompts/vicuna.json
            elif [ "$path_to_base_model" == 'tulu-2-dpo-13b' ]; then
                base_model=./models/tulu-2-dpo-13b
                prompt_template=./prompts/tulu-2.json
            elif [ "$path_to_base_model" == 'tulu-2-dpo-7b' ]; then
                base_model=./models/tulu-2-dpo-7b
                prompt_template=./prompts/tulu-2.json
            elif [ "$path_to_base_model" == 'Mistral-7B-Instruct-v0.1' ]; then
                base_model=./models/llama2/Mistral-7B-Instruct-v0.1
                prompt_template=./prompts/mistral-instruct.json
            fi
            if [ "$mode" == 'self_reflection' ]; then
                # Unused unless 'self_reflection' is added to the mode list above.
                output=./$root_to_save/responses.mode=${mode}.prompt=v2.N=${n_sample}.jsonl
                CUDA_VISIBLE_DEVICES=$GPU python ./code/generate_self_reflection.py \
                    --base_model $base_model --prompt_template $prompt_template \
                    --input $input --output $output \
                    --path_to_gen_self_feedback $path_to_gen_self_feedback \
                    --path_to_revise_w_feedback $path_to_revise_w_feedback \
                    --mode $mode --n_sample $n_sample \
                    --batch_size $batch_size --max_tokens $max_tokens
                if [ "$cal_reward" == true ]; then
                    input=./$root_to_save/responses.mode=${mode}.prompt=v2.N=${n_sample}.jsonl
                    output=./$root_to_save/reward.responses.mode=${mode}.prompt=v2.N=${n_sample}.jsonl
                    log=./$root_to_save/out.responses.mode=${mode}.prompt=v2.N=${n_sample}
                    CUDA_VISIBLE_DEVICES=$GPU python ./code/generate_reward_from_ultraFeedback.py \
                        --input $input --model $path_to_discriminator --output $output > $log
                fi
            elif [ "$mode" == 'sample_N' ]; then
                output=./$root_to_save/responses.mode=${mode}.N=${n_sample}.jsonl
                CUDA_VISIBLE_DEVICES=$GPU python ./code/generate_self_reflection.py \
                    --base_model $base_model --prompt_template $prompt_template \
                    --input $input --output $output \
                    --path_to_gen_self_feedback $path_to_gen_self_feedback \
                    --path_to_revise_w_feedback $path_to_revise_w_feedback \
                    --mode $mode --n_sample $n_sample \
                    --path_user_preference $path_user_preference \
                    --batch_size $batch_size --max_tokens $max_tokens
                if [ "$cal_reward" == true ]; then
                    input=./$root_to_save/responses.mode=${mode}.N=${n_sample}.jsonl
                    output=./$root_to_save/reward.responses.mode=${mode}.N=${n_sample}.jsonl
                    log=./$root_to_save/out.responses.mode=${mode}.N=${n_sample}
                    CUDA_VISIBLE_DEVICES=$GPU python ./code/generate_reward_from_ultraFeedback.py \
                        --input $input --model $path_to_discriminator --output $output \
                        --path_user_preference $path_user_preference > $log
                fi
            elif [ "$mode" == 'sample_N_wo_prefer' ]; then
                # Same sampling as sample_N, but without the user-preference file.
                output=./$root_to_save/responses.mode=${mode}.N=${n_sample}.jsonl
                CUDA_VISIBLE_DEVICES=$GPU python ./code/generate_self_reflection.py \
                    --base_model $base_model --prompt_template $prompt_template \
                    --input $input --output $output \
                    --path_to_gen_self_feedback $path_to_gen_self_feedback \
                    --path_to_revise_w_feedback $path_to_revise_w_feedback \
                    --mode 'sample_N' --n_sample $n_sample \
                    --batch_size $batch_size --max_tokens $max_tokens
                if [ "$cal_reward" == true ]; then
                    input=./$root_to_save/responses.mode=${mode}.N=${n_sample}.jsonl
                    output=./$root_to_save/reward.responses.mode=${mode}.N=${n_sample}.jsonl
                    log=./$root_to_save/out.responses.mode=${mode}.N=${n_sample}
                    CUDA_VISIBLE_DEVICES=$GPU python ./code/generate_reward_from_ultraFeedback.py \
                        --input $input --model $path_to_discriminator --output $output > $log
                fi
            elif [ "$mode" == 'tree_search' ]; then
                # Halve the budget so the total sample count still matches N.
                n_sample=$(( n_sample / 2 ))
                ## phase 1: sample N / 2 responses
                output=./$root_to_save/responses.mode=${mode}.stage=random_sampling.N=${n_sample}.jsonl
                CUDA_VISIBLE_DEVICES=$GPU python ./code/generate_self_reflection.py \
                    --base_model $base_model --prompt_template $prompt_template \
                    --input $input --output $output \
                    --path_to_gen_self_feedback $path_to_gen_self_feedback \
                    --path_to_revise_w_feedback $path_to_revise_w_feedback \
                    --mode 'sample_N' --n_sample $n_sample \
                    --path_user_preference $path_user_preference \
                    --batch_size $batch_size --max_tokens $max_tokens
                ## phase 2: score the phase-1 responses to obtain the best one
                input=./$root_to_save/responses.mode=${mode}.stage=random_sampling.N=${n_sample}.jsonl
                output=./$root_to_save/reward.responses.mode=${mode}.stage=random_sampling.N=${n_sample}.jsonl
                CUDA_VISIBLE_DEVICES=$GPU python ./code/generate_reward_from_ultraFeedback.py \
                    --input $input --model $path_to_discriminator --output $output \
                    --path_user_preference $path_user_preference
                ## phase 3: conduct self-reflection on the best response
                input=./$root_to_save/reward.responses.mode=${mode}.stage=random_sampling.N=${n_sample}.jsonl
                output=./$root_to_save/responses.mode=${mode}.stage=self_reflection.N=${n_sample}.jsonl
                CUDA_VISIBLE_DEVICES=$GPU python ./code/generate_self_reflection.py \
                    --base_model $base_model --prompt_template $prompt_template \
                    --input $input --output $output \
                    --path_to_gen_self_feedback $path_to_gen_self_feedback \
                    --path_to_revise_w_feedback $path_to_revise_w_feedback \
                    --mode 'tree_search' --n_sample $n_sample \
                    --path_user_preference $path_user_preference \
                    --batch_size $batch_size --max_tokens $max_tokens
                if [ "$cal_reward" == true ]; then
                    input=./$root_to_save/responses.mode=${mode}.stage=self_reflection.N=${n_sample}.jsonl
                    output=./$root_to_save/reward.responses.mode=${mode}.stage=self_reflection.N=${n_sample}.jsonl
                    log=./$root_to_save/out.responses.mode=${mode}.stage=self_reflection.N=${n_sample}
                    CUDA_VISIBLE_DEVICES=$GPU python ./code/generate_reward_from_ultraFeedback.py \
                        --input $input --model $path_to_discriminator --output $output \
                        --path_user_preference $path_user_preference > $log
                fi
            elif [ "$mode" == 'tree_search_wo_feedback' ]; then
                # Same pipeline as tree_search, but phase 3 revises the best
                # response without self-generated feedback.
                n_sample=$(( n_sample / 2 ))
                ## phase 1: sample N / 2 responses
                output=./$root_to_save/responses.mode=${mode}.stage=random_sampling.N=${n_sample}.jsonl
                CUDA_VISIBLE_DEVICES=$GPU python ./code/generate_self_reflection.py \
                    --base_model $base_model --prompt_template $prompt_template \
                    --input $input --output $output \
                    --path_to_gen_self_feedback $path_to_gen_self_feedback \
                    --path_to_revise_w_feedback $path_to_revise_w_feedback \
                    --mode 'sample_N' --n_sample $n_sample \
                    --path_user_preference $path_user_preference \
                    --batch_size $batch_size --max_tokens $max_tokens
                ## phase 2: score the phase-1 responses to obtain the best one
                input=./$root_to_save/responses.mode=${mode}.stage=random_sampling.N=${n_sample}.jsonl
                output=./$root_to_save/reward.responses.mode=${mode}.stage=random_sampling.N=${n_sample}.jsonl
                CUDA_VISIBLE_DEVICES=$GPU python ./code/generate_reward_from_ultraFeedback.py \
                    --input $input --model $path_to_discriminator --output $output \
                    --path_user_preference $path_user_preference
                ## phase 3: revise the best response without self-feedback
                input=./$root_to_save/reward.responses.mode=${mode}.stage=random_sampling.N=${n_sample}.jsonl
                output=./$root_to_save/responses.mode=${mode}.stage=self_reflection.N=${n_sample}.jsonl
                CUDA_VISIBLE_DEVICES=$GPU python ./code/generate_self_reflection.py \
                    --base_model $base_model --prompt_template $prompt_template \
                    --input $input --output $output \
                    --path_to_gen_self_feedback $path_to_gen_self_feedback \
                    --path_to_revise_w_feedback $path_to_revise_w_feedback \
                    --path_to_revise_wo_feedback $path_to_revise_wo_feedback \
                    --mode 'tree_search_wo_feedback' --n_sample $n_sample \
                    --path_user_preference $path_user_preference \
                    --batch_size $batch_size --max_tokens $max_tokens
                if [ "$cal_reward" == true ]; then
                    input=./$root_to_save/responses.mode=${mode}.stage=self_reflection.N=${n_sample}.jsonl
                    output=./$root_to_save/reward.responses.mode=${mode}.stage=self_reflection.N=${n_sample}.jsonl
                    log=./$root_to_save/out.responses.mode=${mode}.stage=self_reflection.N=${n_sample}
                    CUDA_VISIBLE_DEVICES=$GPU python ./code/generate_reward_from_ultraFeedback.py \
                        --input $input --model $path_to_discriminator --output $output \
                        --path_user_preference $path_user_preference > $log
                fi
            fi
        done
    done
done
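
# Outputs per model directory ./<data_source>.prefer=<prefer_type>.<model>:
#   responses.mode=<mode>...jsonl        - generated responses
#   reward.responses.mode=<mode>...jsonl - UltraRM-13b scores (when cal_reward=true)
#   out.responses.mode=<mode>...         - reward-scoring logs
#
# A minimal sketch for picking the best-of-N response from a scored file; the
# "reward" and "response" field names are assumptions about the JSONL schema,
# not confirmed by this script -- adjust them to the actual reward output:
#
#   python -c 'import json,sys; rows=[json.loads(l) for l in open(sys.argv[1])]; print(max(rows, key=lambda r: r["reward"])["response"])' \
#       reward.responses.mode=sample_N.N=128.jsonl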