test_results.py (forked from flowersteam/Grounding_LLMs_with_online_RL)
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
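
# Prints aggregate evaluation results for trained agents: the script scans `root` for run
# directories whose names match a regex, loads the per-episode returns saved under
# <run>/test/<env>/return_per_episode/<test_name>.npy (layout inferred from the np.load
# calls below), and prints the mean return and the mean success rate with a 99% confidence
# interval for each test variant.
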
def print_test_results():
    root = ''  # path to the directory containing the experiment run folders
    list_dir = os.listdir(root)
    for test_name in ['no_modification_test', 'other_name_same_categories', 'adj_synonym', 'no_meaning_nouns',
                      'no_meaning_adj', 'no_meaning_words', 'change_intro_first_personne_speaker',
                      'change_intro_first_personne_agent']:
        print('NAME TESTS: {}'.format(test_name))
        reward_list = []
        # Uncomment one of the following patterns to select a different family of runs:
        # for model_name in ['.*llm_mtrl_nbr_env_32_Flan_T5large_pretrained_True_nbr_actions_6_turn_left_turn_right_go_forward_pick_up_drop_toggle_shape_reward_beta_0.*']:
        # for model_name in ['.*llm_gtl_nbr_env_32_Flan_T5large_pretrained_True_nbr_actions_6_turn_left_turn_right_go_forward_pick_up_drop_toggle_shape_reward_beta_0.*']:
        # for model_name in ['.*drrn_mtrl_nbr_env_32_DRRN_pretrained_True_nbr_actions_6_turn_left_turn_right_go_forward_pick_up_drop_toggle_shape_reward_beta_0.*']:
        for model_name in ['.*drrn_gtl_nbr_env_32_DRRN_pretrained_True_nbr_actions_6_turn_left_turn_right_go_forward_pick_up_drop_toggle_shape_reward_beta_0_seed_1.*']:
            for directory in list_dir:
                if re.match(model_name, directory):
                    # Alternative result files (other environments, renamed-action variants, zero-shot evaluations):
                    # reward_list.append(np.load(root+'/'+directory+'/test'+'/BabyAI-MixtTrainLocal-v0'+'/return_per_episode/'+test_name + '.npy'))
                    # reward_list.append(np.load(root+'/'+directory+'/test'+'/BabyAI-MixtTrainLocal-v0'+'/return_per_episode/'+test_name + '_zero_shot' + '.npy'))
                    # reward_list.append(np.load(root+'/'+directory+'/test'+'/BabyAI-MixtTestLocal-v0'+'/return_per_episode/'+test_name + '.npy'))
                    # reward_list.append(np.load(root+'/'+directory+'/test'+'/BabyAI-MixtTestLocal-v0'+'/return_per_episode/'+test_name + '_zero_shot' + '.npy'))
                    # reward_list.append(np.load(root+'/'+directory+'/test'+'/BabyAI-MixtTrainLocal-v0'+'/return_per_episode/'+test_name + '_shift_left_shift_right_go_ahead_take_release_turn' + '.npy'))
                    # reward_list.append(np.load(root+'/'+directory+'/test'+'/BabyAI-MixtTrainLocal-v0'+'/return_per_episode/'+test_name + '_shift_left_shift_right_go_ahead_take_release_turn' + '_zero_shot' + '.npy'))
                    # reward_list.append(np.load(root+'/'+directory+'/test'+'/BabyAI-MixtTrainLocal-v0'+'/return_per_episode/'+test_name + '_rotate_left_rotate_right_move_ahead_take_release_switch' + '.npy'))
                    # reward_list.append(np.load(root+'/'+directory+'/test'+'/BabyAI-MixtTrainLocal-v0'+'/return_per_episode/'+test_name + '_rotate_left_rotate_right_move_ahead_take_release_switch' + '_zero_shot' + '.npy'))
                    # reward_list.append(np.load(root+'/'+directory+'/test'+'/BabyAI-PickUpSeqPickUpLocal-v0'+'/return_per_episode/'+test_name + '.npy'))
                    # reward_list.append(np.load(root+'/'+directory+'/test'+'/BabyAI-PickUpSeqPickUpLocal-v0'+'/return_per_episode/'+test_name + '_zero_shot' + '.npy'))
                    # reward_list.append(np.load(root+'/'+directory+'/test'+'/BabyAI-PickUpSeqGoToLocal-v0'+'/return_per_episode/'+test_name + '.npy'))
                    # reward_list.append(np.load(root+'/'+directory+'/test'+'/BabyAI-PickUpSeqGoToLocal-v0'+'/return_per_episode/'+test_name + '_zero_shot' + '.npy'))
                    # reward_list.append(np.load(root+'/'+directory+'/test'+'/BabyAI-GoToAfterPickUpLocal-v0'+'/return_per_episode/'+test_name + '.npy'))
                    # reward_list.append(np.load(root+'/'+directory+'/test'+'/BabyAI-GoToAfterPickUpLocal-v0'+'/return_per_episode/'+test_name + '_zero_shot' + '.npy'))
                    # reward_list.append(np.load(root+'/'+directory+'/test'+'/BabyAI-PickUpThenGoToLocal-v0'+'/return_per_episode/'+test_name + '.npy'))
                    # reward_list.append(np.load(root+'/'+directory+'/test'+'/BabyAI-PickUpThenGoToLocal-v0'+'/return_per_episode/'+test_name + '_zero_shot' + '.npy'))
                    reward_list.append(np.load(root+'/'+directory+'/test'+'/BabyAI-GoToFrench-v0'+'/return_per_episode/'+test_name + '.npy'))
                    # reward_list.append(np.load(root+'/'+directory+'/test'+'/BabyAI-GoToFrench-v0'+'/return_per_episode/'+test_name + '_zero_shot' + '.npy'))
                    # reward_list.append(np.load(root+'/'+directory+'/test'+'/BabyAI-GoToLocal-v0'+'/return_per_episode/'+test_name + '.npy'))
                    # reward_list.append(np.load(root+'/'+directory+'/test'+'/BabyAI-GoToLocal-v0'+'/return_per_episode/'+test_name + '_zero_shot' + '.npy'))
        reward_array = np.concatenate(reward_list)
        success_traj = [(r > 0).astype(int) for r in reward_list]  # 1 if the episode succeeded (positive return), else 0
        # sr_array = np.array([np.mean(st) for st in success_traj])
        sr_array = []
        bootstrapping = 1  # number of chunks each run is split into; 1 = no bootstrapping
        for st in success_traj:
            for i in range(bootstrapping):
                sr_array.append(np.mean(st[i:int((i + 1) * (len(st) / bootstrapping))]))
        sr_array = np.array(sr_array)
        """plt.hist(reward_array, bins=100)
        plt.title(test_name)
        plt.show()"""
        z_p = 2.575829303549  # two-sided z-value for a 99% confidence interval
        print("For {} the mean return per episode is {} +- {}".format(test_name, np.mean(reward_array), np.std(reward_array)))
        print("For {} the mean success rate per episode is {} +- {}".format(test_name, np.mean(sr_array), z_p * np.std(sr_array, ddof=1) / np.sqrt(len(sr_array))))


print_test_results()
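
# The +- value printed for the success rate is a normal-approximation confidence interval,
# z_p * s / sqrt(n), with s the sample standard deviation across runs (or bootstrap chunks).
# As a sketch, assuming SciPy were available (this script does not otherwise require it),
# the same quantile could be obtained with:
#
#     from scipy.stats import norm
#     z_p = norm.ppf(1 - 0.01 / 2)  # ≈ 2.5758, two-sided 99% interval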