diff --git a/llm_rl/example_ppo.sh b/llm_rl/example_ppo.sh
index 0e609310..ef69912e 100644
--- a/llm_rl/example_ppo.sh
+++ b/llm_rl/example_ppo.sh
@@ -1,15 +1,18 @@
 CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
     --stage ppo \
-    --model_name_or_path path_to_llama_model \
+    --model_name_or_path mistralai/Mistral-7B-v0.1 \
     --do_train \
     --dataset alpaca_gpt4_en \
     --template default \
     --finetuning_type lora \
     --lora_target q_proj,v_proj \
     --resume_lora_training False \
-    --checkpoint_dir path_to_sft_checkpoint \
-    --reward_model path_to_rm_checkpoint \
-    --output_dir path_to_ppo_checkpoint \
+    --checkpoint_dir /workspace/sotopia-llm/llm_rl/mistral-7b-sft_cache/checkpoint-10 \
+    --reward_model /workspace/sotopia-llm/llm_rl/mistral-7b-rm_cache/checkpoint-10 \
+    --cache_dir ./model_cache \
+    --overwrite_cache \
+    --output_dir ./mistral-7b-ppo_cache \
+    --overwrite_output_dir \
     --per_device_train_batch_size 2 \
     --gradient_accumulation_steps 4 \
     --lr_scheduler_type cosine \
@@ -18,4 +21,5 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
     --learning_rate 1e-5 \
     --num_train_epochs 1.0 \
     --plot_loss \
-    --fp16
\ No newline at end of file
+    --bf16
+
diff --git a/llm_rl/example_reward_model.sh b/llm_rl/example_rm.sh
similarity index 57%
rename from llm_rl/example_reward_model.sh
rename to llm_rl/example_rm.sh
index 873b07a5..c2295018 100644
--- a/llm_rl/example_reward_model.sh
+++ b/llm_rl/example_rm.sh
@@ -1,23 +1,27 @@
-CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
+CUDA_VISIBLE_DEVICES=0 deepspeed src/train_bash.py \
     --stage rm \
-    --model_name_or_path meta-llama/Llama-2-13b-hf \
+    --model_name_or_path mistralai/Mistral-7B-v0.1 \
     --do_train \
     --dataset comparison_gpt4_en \
     --template default \
     --finetuning_type lora \
     --lora_target q_proj,v_proj \
     --resume_lora_training False \
-    --output_dir ./llama-2-13b-rm_cache \
-    --per_device_train_batch_size 8 \
+    --per_device_train_batch_size 1 \
     --gradient_accumulation_steps 8 \
     --lr_scheduler_type cosine \
     --logging_steps 10 \
-    --save_steps 1000 \
+    --save_steps 10 \
    --learning_rate 1e-6 \
     --num_train_epochs 1.0 \
     --plot_loss \
-    --fp16 \
+    --bf16 \
+    --cache_dir ./model_cache \
+    --overwrite_cache \
+    --output_dir ./mistral-7b-rm_cache \
+    --overwrite_output_dir \
     --use_auth_token True \
     --wandb_token "99caa13ec9552adf0e92e5c30021307ce3cf7fa4" \
     --hf_auth_token "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" \
-    --deepspeed ./deepspeed_config_s2.json
+    --deepspeed ./deepspeed_config_s2.json \
+    --gradient_checkpointing True
diff --git a/llm_rl/example_sft.sh b/llm_rl/example_sft.sh
new file mode 100644
index 00000000..e8656af3
--- /dev/null
+++ b/llm_rl/example_sft.sh
@@ -0,0 +1,24 @@
+CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
+    --stage sft \
+    --model_name_or_path mistralai/Mistral-7B-v0.1 \
+    --do_train \
+    --dataset alpaca_gpt4_en \
+    --template default \
+    --finetuning_type lora \
+    --lora_target q_proj,v_proj \
+    --resume_lora_training False \
+    --cache_dir ./model_cache \
+    --overwrite_cache \
+    --output_dir ./mistral-7b-sft_cache \
+    --overwrite_output_dir \
+    --per_device_train_batch_size 4 \
+    --gradient_accumulation_steps 4 \
+    --lr_scheduler_type cosine \
+    --logging_steps 10 \
+    --save_steps 1000 \
+    --learning_rate 5e-5 \
+    --num_train_epochs 3.0 \
+    --plot_loss \
+    --bf16 \
+    --gradient_checkpointing True \
+    --save_steps 10
\ No newline at end of file
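For context, the three example scripts are meant to run in sequence (a sketch of the assumed workflow, not part of the diff): example_sft.sh writes LoRA checkpoints to ./mistral-7b-sft_cache, example_rm.sh writes reward-model checkpoints to ./mistral-7b-rm_cache, and example_ppo.sh then loads both via --checkpoint_dir and --reward_model. Note that example_sft.sh passes --save_steps twice (1000, then 10); with an argparse-based parser the last value should win, so both earlier stages save every 10 steps, which is what makes the checkpoint-10 directories exist. The PPO script hard-codes /workspace/sotopia-llm/llm_rl/... for those paths, so they only line up if the repo is checked out at /workspace/sotopia-llm.

    cd llm_rl
    bash example_sft.sh   # SFT LoRA adapter  -> ./mistral-7b-sft_cache/checkpoint-10
    bash example_rm.sh    # reward model      -> ./mistral-7b-rm_cache/checkpoint-10
    bash example_ppo.sh   # PPO from the SFT adapter, scored by the reward model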