diff --git a/games/cartpole-continuous.py b/games/cartpole-continuous.py index 9245cd51..fd793f29 100644 --- a/games/cartpole-continuous.py +++ b/games/cartpole-continuous.py @@ -13,11 +13,11 @@ class MuZeroConfig: def __init__(self): self.seed = 0 # Seed for numpy, torch and the game - - ### Game self.observation_shape = (1, 1, 4) # Dimensions of the game observation, must be 3D (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array) - self.action_space = [i for i in range(2)] # Fixed list of all possible actions. You should only edit the length + numJoints = 1 + maxSteps = 50 + self.action_space = numpy.ones(numJoints) # Fixed list of all possible actions. You should only edit the length self.players = [i for i in range(1)] # List of players. You should only edit the length self.stacked_observations = 0 # Number of previous observations and previous actions to add to the current observation @@ -26,7 +26,6 @@ def __init__(self): self.opponent = None # Hard coded agent that MuZero faces to assess his progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class - ### Self-Play self.num_actors = 1 # Number of simultaneous threads self-playing to feed the replay buffer self.max_moves = 500 # Maximum number of moves if game is not finished before @@ -42,6 +41,10 @@ def __init__(self): self.pb_c_base = 19652 self.pb_c_init = 1.25 + #Progressive widening + self.progressive_widening_C_pw = 1 + self.progressive_widening_a = 0.49 + ### Network @@ -105,6 +108,8 @@ def __init__(self): self.training_delay = 0 # Number of seconds to wait after each training step self.ratio = 1/2 # Desired self played games per training step ratio. Equivalent to a synchronous version, training can take much longer. Set it to None to disable it + self.log_video = False + self.video_iter = 1000 def visit_softmax_temperature_fn(self, trained_steps): """ @@ -142,9 +147,9 @@ def step(self, action): Returns: The new observation, the reward and a boolean if the game has ended. """ - action = -1 if action < -1 else action - action = 1 if action > 1 else action - observation, reward, done, _ = self.env.step(action) + action = [-1] if action[0] < -1 else action + action = [1] if action[0] > 1 else action + observation, reward, done, _ = self.env.step(action[0]) return numpy.array([[observation]]), reward, done def legal_actions(self): @@ -175,12 +180,14 @@ def close(self): """ self.env.close() - def render(self): + def render(self, mode='human'): """ Display the game observation. """ - self.env.render() - input("Press enter to take a step ") + ret = self.env.render(mode) + if mode == 'human': + input("Press enter to take a step ") + return ret class ContinuousCartPoleEnv(gym.Env): diff --git a/games/halfcheetah.py b/games/halfcheetah.py new file mode 100644 index 00000000..9f87e0c8 --- /dev/null +++ b/games/halfcheetah.py @@ -0,0 +1,178 @@ +import datetime +import math +import os + +import gym +import numpy +import torch + +from .abstract_game import AbstractGame +from gym.envs.mujoco.half_cheetah import HalfCheetahEnv + + +class MuZeroConfig: + def __init__(self): + self.seed = 0 # Seed for numpy, torch and the game + + ### Game + self.observation_shape = (1, 1, + 17) # Dimensions of the game observation, must be 3D (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array) + numJoints=6 + self.action_space = [-numpy.ones(numJoints), numpy.ones(numJoints)] # Fixed list of all possible actions. 
You should only edit the length + self.players = [i for i in range(1)] # List of players. You should only edit the length + self.stacked_observations = 0 # Number of previous observations and previous actions to add to the current observation + + # Evaluate + self.muzero_player = 0 # Turn Muzero begins to play (0: MuZero plays first, 1: MuZero plays second) + self.opponent = None # Hard coded agent that MuZero faces to assess his progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class + + ### Self-Play + self.num_actors = 1 # Number of simultaneous threads self-playing to feed the replay buffer + self.max_moves = 50 # Maximum number of moves if game is not finished before + self.num_simulations = self.max_moves # Number of future moves self-simulated + self.discount = 0.997 # Chronological discount of the reward + self.temperature_threshold = None # Number of moves before dropping temperature to 0 (ie playing according to the max) + + # Root prior exploration noise + self.root_dirichlet_alpha = 0.25 + self.root_exploration_fraction = 0.25 + + # UCB formula + self.pb_c_base = 19652 + self.pb_c_init = 1.25 + + ### Network + self.network = "fullyconnected" # "resnet" / "fullyconnected" + self.support_size = 10 # Value and reward are scaled (with almost sqrt) and encoded on a vector with a range of -support_size to support_size + + # Residual Network + self.downsample = False # Downsample observations before representation network (See paper appendix Network Architecture) + self.blocks = 1 # Number of blocks in the ResNet + self.channels = 2 # Number of channels in the ResNet + self.reduced_channels = 2 # Number of channels before heads of dynamic and prediction networks + self.resnet_fc_reward_layers = [] # Define the hidden layers in the reward head of the dynamic network + self.resnet_fc_value_layers = [] # Define the hidden layers in the value head of the prediction network + self.resnet_fc_policy_layers = [] # Define the hidden layers in the policy head of the prediction network + + # Fully Connected Network + self.encoding_size = 8 + self.fc_representation_layers = [] # Define the hidden layers in the representation network + self.fc_dynamics_layers = [16] # Define the hidden layers in the dynamics network + self.fc_reward_layers = [16] # Define the hidden layers in the reward network + self.fc_value_layers = [] # Define the hidden layers in the value network + self.fc_policy_layers = [] # Define the hidden layers in the policy network + + ### Training + self.results_path = os.path.join(os.path.dirname(__file__), "../results", os.path.basename(__file__)[:-3], + datetime.datetime.now().strftime( + "%Y-%m-%d--%H-%M-%S")) # Path to store the model weights and TensorBoard logs + self.training_steps = 5000 # Total number of training steps (ie weights update according to a batch) + self.batch_size = 128 # Number of parts of games to train on at each training step + self.checkpoint_interval = 10 # Number of training steps before using the model for sef-playing + self.value_loss_weight = 1 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze) + self.training_device = "cuda" if torch.cuda.is_available() else "cpu" # Train on GPU if available + + self.optimizer = "Adam" # "Adam" or "SGD". 
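The support_size comment above refers to MuZero's categorical value/reward targets: a scalar is squashed with the near-sqrt transform h(x) = sign(x)(sqrt(|x|+1) - 1) + eps*x and spread over the two nearest bins of a vector indexed from -support_size to support_size. A minimal sketch of that encoding, assuming the transform from the MuZero paper appendix (the repository's models.py remains the reference implementation):

import numpy

def scalar_to_support(x, support_size, eps=0.001):
    # Squash the scalar, then distribute it over the two nearest integer bins.
    x = numpy.sign(x) * (numpy.sqrt(numpy.abs(x) + 1) - 1) + eps * x
    x = float(numpy.clip(x, -support_size, support_size))
    low = int(numpy.floor(x))
    prob_high = x - low
    support = numpy.zeros(2 * support_size + 1)
    support[low + support_size] = 1 - prob_high
    if low + support_size + 1 <= 2 * support_size:
        support[low + support_size + 1] = prob_high
    return support  # e.g. support_size=10 gives a 21-dim target vector
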
Paper uses SGD + self.weight_decay = 1e-4 # L2 weights regularization + self.momentum = 0.9 # Used only if optimizer is SGD + + # Exponential learning rate schedule + self.lr_init = 0.05 # Initial learning rate + self.lr_decay_rate = 1 # Set it to 1 to use a constant learning rate + self.lr_decay_steps = 1000 + + ### Replay Buffer + self.window_size = 500 # Number of self-play games to keep in the replay buffer + self.num_unroll_steps = 10 # Number of game moves to keep for every batch element + self.td_steps = self.max_moves # Number of steps in the future to take into account for calculating the target value + self.use_last_model_value = True # Use the last model to provide a fresher, stable n-step value (See paper appendix Reanalyze) + + # Prioritized Replay (See paper appendix Training) + self.PER = True # Select in priority the elements in the replay buffer which are unexpected for the network + self.use_max_priority = True # Use the n-step TD error as initial priority. Better for large replay buffer + self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1 + self.PER_beta = 1.0 + + ### Adjust the self play / training ratio to avoid over/underfitting + self.self_play_delay = 0 # Number of seconds to wait after each played game + self.training_delay = 0 # Number of seconds to wait after each training step + self.ratio = 1 / 2 # Desired self played games per training step ratio. Equivalent to a synchronous version, training can take much longer. Set it to None to disable it + + def visit_softmax_temperature_fn(self, trained_steps): + """ + Parameter to alter the visit count distribution to ensure that the action selection becomes greedier as training progresses. + The smaller it is, the more likely the best action (ie with the highest visit count) is chosen. + + Returns: + Positive float. + """ + if trained_steps < 0.5 * self.training_steps: + return 1 + elif trained_steps < 0.75 * self.training_steps: + return 0.1 + else: + return 0.01 + + +class Game(AbstractGame): + """ + Game wrapper. + """ + + def __init__(self, seed=None): + self.env = HalfCheetahEnv() + if seed is not None: + self.env.seed(seed) + + def step(self, action): + """ + Apply action to the game. + + Args: + action : action of the action_space to take. + + Returns: + The new observation, the reward and a boolean if the game has ended. + """ + observation, reward, done, _ = self.env.step(action) + return numpy.array([[observation]]), reward, done + + def legal_actions(self): + """ + Should return the legal actions at each turn, if it is not available, it can return + the whole action space. At each turn, the game have to be able to handle one of returned actions. + + For complex game where calculating legal moves is too long, the idea is to define the legal actions + equal to the action space but to return a negative reward if the action is illegal. + + Returns: + An array of integers, subset of the action space. + """ + #return [i for i in range(2)] + numJoints = self.env.action_space.shape[0] + return [-numpy.ones(numJoints), numpy.ones(numJoints)] + + def reset(self): + """ + Reset the game for a new game. + + Returns: + Initial observation of the game. + """ + return numpy.array([[self.env.reset()]]) + + def close(self): + """ + Properly close the game. + """ + self.env.close() + + def render(self, mode='human'): + """ + Display the game observation. 
+ """ + ret = self.env.render(mode) + if mode == 'human': + input("Press enter to take a step ") + return ret + diff --git a/games/lunarlandercontinuous.py b/games/lunarlandercontinuous.py new file mode 100644 index 00000000..3c94eabf --- /dev/null +++ b/games/lunarlandercontinuous.py @@ -0,0 +1,186 @@ +import datetime +import math +import os + +import gym +import numpy +import torch + +from .abstract_game import AbstractGame +from gym.envs.box2d.lunar_lander import LunarLanderContinuous + +class MuZeroConfig: + def __init__(self): + self.seed = 0 # Seed for numpy, torch and the game + + ### Game + self.observation_shape = (1, 1, + 8) # Dimensions of the game observation, must be 3D (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array) + numJoints = 2 + maxSteps = 700 + + self.action_space = numpy.ones(numJoints) #models.py just does len of this variable to determine the action space #range of action space doesn't matter because gaussian network will create actions without regard to limits # Fixed list of all possible actions. You should only edit the length + self.players = [i for i in range(1)] # List of players. You should only edit the length + self.stacked_observations = 0 # Number of previous observations and previous actions to add to the current observation + + # Evaluate + self.muzero_player = 0 # Turn Muzero begins to play (0: MuZero plays first, 1: MuZero plays second) + self.opponent = None # Hard coded agent that MuZero faces to assess his progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class + + ### Self-Play + self.num_actors = 1 # Number of simultaneous threads self-playing to feed the replay buffer + self.max_moves = maxSteps # Maximum number of moves if game is not finished before + self.num_simulations = 100 # Number of future moves self-simulated + self.discount = 0.997 # Chronological discount of the reward + self.temperature_threshold = None # Number of moves before dropping temperature to 0 (ie playing according to the max) + + # Root prior exploration noise + self.root_dirichlet_alpha = 0.25 + self.root_exploration_fraction = 0.25 + + # UCB formula + self.pb_c_base = 19652 + self.pb_c_init = 1.25 + + #Progressive widening + self.progressive_widening_C_pw = 1 + self.progressive_widening_a = 0.49 + + ### Network + self.network = "fullyconnected" # "resnet" / "fullyconnected" + self.support_size = 35 # Value and reward are scaled (with almost sqrt) and encoded on a vector with a range of -support_size to support_size + + # Residual Network + self.downsample = False # Downsample observations before representation network (See paper appendix Network Architecture) + self.blocks = 1 # Number of blocks in the ResNet + self.channels = 2 # Number of channels in the ResNet + self.reduced_channels = 2 # Number of channels before heads of dynamic and prediction networks + self.resnet_fc_reward_layers = [] # Define the hidden layers in the reward head of the dynamic network + self.resnet_fc_value_layers = [] # Define the hidden layers in the value head of the prediction network + self.resnet_fc_policy_layers = [] # Define the hidden layers in the policy head of the prediction network + + # Fully Connected Network + self.encoding_size = 64 + self.fc_representation_layers = [] # Define the hidden layers in the representation network + self.fc_dynamics_layers = [400,300] # Define the hidden layers in the dynamics network + self.fc_reward_layers = [400,300] # Define the hidden layers 
in the reward network + self.fc_value_layers = [400,300] # Define the hidden layers in the value network + self.fc_policy_layers = [400,300] # Define the hidden layers in the policy network + + ### Training + self.results_path = os.path.join(os.path.dirname(__file__), "../results", os.path.basename(__file__)[:-3], + datetime.datetime.now().strftime( + "%Y-%m-%d--%H-%M-%S")) # Path to store the model weights and TensorBoard logs + self.training_steps = 500000 # Total number of training steps (ie weights update according to a batch) + self.batch_size = 128 # Number of parts of games to train on at each training step + self.checkpoint_interval = 10 # Number of training steps before using the model for sef-playing + self.value_loss_weight = 1 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze) + self.training_device = "cuda" if torch.cuda.is_available() else "cpu" # Train on GPU if available + + self.optimizer = "Adam" # "Adam" or "SGD". Paper uses SGD + self.weight_decay = 1e-4 # L2 weights regularization + self.momentum = 0.9 # Used only if optimizer is SGD + + # Exponential learning rate schedule + self.lr_init = 0.001 # Initial learning rate + self.lr_decay_rate = 1 # Set it to 1 to use a constant learning rate + self.lr_decay_steps = 1000 + + ### Replay Buffer + self.window_size = 1000 # Number of self-play games to keep in the replay buffer + self.num_unroll_steps = 100 # Number of game moves to keep for every batch element + self.td_steps = 200 # Number of steps in the future to take into account for calculating the target value + self.use_last_model_value = True # Use the last model to provide a fresher, stable n-step value (See paper appendix Reanalyze) + + # Prioritized Replay (See paper appendix Training) + self.PER = True # Select in priority the elements in the replay buffer which are unexpected for the network + self.use_max_priority = True # Use the n-step TD error as initial priority. Better for large replay buffer + self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1 + self.PER_beta = 1.0 + + ### Adjust the self play / training ratio to avoid over/underfitting + self.self_play_delay = 0 # Number of seconds to wait after each played game + self.training_delay = 0 # Number of seconds to wait after each training step + self.ratio = 1 / 5 # Desired self played games per training step ratio. Equivalent to a synchronous version, training can take much longer. Set it to None to disable it + + self.log_video = False + self.video_iter = 1000 + + def visit_softmax_temperature_fn(self, trained_steps): + """ + Parameter to alter the visit count distribution to ensure that the action selection becomes greedier as training progresses. + The smaller it is, the more likely the best action (ie with the highest visit count) is chosen. + + Returns: + Positive float. + """ + if trained_steps < 0.5 * self.training_steps: + return 1 + elif trained_steps < 0.75 * self.training_steps: + return 0.1 + else: + return 0.01 + + +class Game(AbstractGame): + """ + Game wrapper. + """ + + def __init__(self, seed=None): + self.env = LunarLanderContinuous() + if seed is not None: + self.env.seed(seed) + + def step(self, action): + """ + Apply action to the game. + + Args: + action : action of the action_space to take. + + Returns: + The new observation, the reward and a boolean if the game has ended. 
+ """ + observation, reward, done, _ = self.env.step(action) + return numpy.array([[observation]]), reward, done + + def legal_actions(self): + """ + Should return the legal actions at each turn, if it is not available, it can return + the whole action space. At each turn, the game have to be able to handle one of returned actions. + + For complex game where calculating legal moves is too long, the idea is to define the legal actions + equal to the action space but to return a negative reward if the action is illegal. + + Returns: + An array of integers, subset of the action space. + """ + #return [i for i in range(2)] + numJoints = self.env.action_space.shape[0] + return [-numpy.ones(numJoints), numpy.ones(numJoints)] + + def reset(self): + """ + Reset the game for a new game. + + Returns: + Initial observation of the game. + """ + return numpy.array([[self.env.reset()]]) + + def close(self): + """ + Properly close the game. + """ + self.env.close() + + def render(self, mode='human'): + """ + Display the game observation. + """ + ret = self.env.render(mode) + if mode == 'human': + input("Press enter to take a step ") + return ret + diff --git a/games/reacher.py b/games/reacher.py new file mode 100644 index 00000000..aaba20ab --- /dev/null +++ b/games/reacher.py @@ -0,0 +1,188 @@ +import datetime +import math +import os + +import gym +import numpy +import torch + +from .abstract_game import AbstractGame +from gym.envs.mujoco.reacher import ReacherEnv + +class MuZeroConfig: + def __init__(self): + self.seed = 0 # Seed for numpy, torch and the game + + ### Game + self.observation_shape = (1, 1, + 11) # Dimensions of the game observation, must be 3D (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array) + numJoints=2 + maxSteps = 50 + self.action_space = numpy.ones(numJoints) #models.py just does len of this variable to determine the action space + # #range of action space doesn't matter because gaussian network will create actions without regard to limits # Fixed list of all possible actions. You should only edit the length + self.players = [i for i in range(1)] # List of players. You should only edit the length + self.stacked_observations = 0 # Number of previous observations and previous actions to add to the current observation + + # Evaluate + self.muzero_player = 0 # Turn Muzero begins to play (0: MuZero plays first, 1: MuZero plays second) + self.opponent = None # Hard coded agent that MuZero faces to assess his progress in multiplayer games. It doesn't influence training. 
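The action_space comment a few lines up states the key convention shared by all of these continuous configs: only len(self.action_space) is consumed (it fixes the number of joints the Gaussian policy head predicts), and the sampled actions are unbounded until the environment or the Game.step wrapper clips them. A rough sketch of that idea, with hypothetical names (the real head is defined in models.py):

import torch

class GaussianPolicyHead(torch.nn.Module):
    # Illustrative sketch only: one mean/std pair per joint.
    def __init__(self, encoding_size, action_space):
        super().__init__()
        num_joints = len(action_space)  # only the length of action_space matters
        self.mean = torch.nn.Linear(encoding_size, num_joints)
        self.log_std = torch.nn.Linear(encoding_size, num_joints)

    def forward(self, hidden_state):
        mean = self.mean(hidden_state)
        std = self.log_std(hidden_state).clamp(-5, 2).exp()
        # Samples ignore the environment's action limits; the Game.step wrapper
        # (or the MuJoCo env itself) is expected to clip them, e.g. to [-1, 1].
        return torch.distributions.Normal(mean, std).sample()
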
None, "random" or "expert" if implemented in the Game class + + ### Self-Play + self.num_actors = 1 # Number of simultaneous threads self-playing to feed the replay buffer + self.max_moves = maxSteps # Maximum number of moves if game is not finished before + self.num_simulations = maxSteps # Number of future moves self-simulated + self.discount = 0.997 # Chronological discount of the reward + self.temperature_threshold = None # Number of moves before dropping temperature to 0 (ie playing according to the max) + + # Root prior exploration noise + self.root_dirichlet_alpha = 0.25 + self.root_exploration_fraction = 0.25 + + # UCB formula + self.pb_c_base = 19652 + self.pb_c_init = 1.25 + + #Progressive widening + self.progressive_widening_C_pw = 1 + self.progressive_widening_a = 0.49 + + ### Network + self.network = "fullyconnected" # "resnet" / "fullyconnected" + self.support_size = 13 # Value and reward are scaled (with almost sqrt) and encoded on a vector with a range of -support_size to support_size + + # Residual Network + self.downsample = False # Downsample observations before representation network (See paper appendix Network Architecture) + self.blocks = 1 # Number of blocks in the ResNet + self.channels = 2 # Number of channels in the ResNet + self.reduced_channels = 2 # Number of channels before heads of dynamic and prediction networks + self.resnet_fc_reward_layers = [] # Define the hidden layers in the reward head of the dynamic network + self.resnet_fc_value_layers = [] # Define the hidden layers in the value head of the prediction network + self.resnet_fc_policy_layers = [] # Define the hidden layers in the policy head of the prediction network + + # Fully Connected Network + self.encoding_size = 64 + self.fc_representation_layers = [] # Define the hidden layers in the representation network + self.fc_dynamics_layers = [100] # Define the hidden layers in the dynamics network + self.fc_reward_layers = [100] # Define the hidden layers in the reward network + self.fc_value_layers = [100] # Define the hidden layers in the value network + self.fc_policy_layers = [100] # Define the hidden layers in the policy network + + ### Training + self.results_path = os.path.join(os.path.dirname(__file__), "../results", os.path.basename(__file__)[:-3], + datetime.datetime.now().strftime( + "%Y-%m-%d--%H-%M-%S")) # Path to store the model weights and TensorBoard logs + self.training_steps = 500000 # Total number of training steps (ie weights update according to a batch) + self.batch_size = 100 # Number of parts of games to train on at each training step + self.checkpoint_interval = 10 # Number of training steps before using the model for sef-playing + self.value_loss_weight = 1 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze) + self.training_device = "cuda" if torch.cuda.is_available() else "cpu" # Train on GPU if available + + self.optimizer = "Adam" # "Adam" or "SGD". 
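The progressive_widening_C_pw and progressive_widening_a values above cap how many distinct continuous actions a search node may hold as its visit count grows. A common rule, presumably what the MCTS applies here, is |children(s)| <= C_pw * N(s)^a; with a ≈ 0.49 the action set grows roughly with the square root of the visit count. A hedged sketch (node and sample_action are placeholders, not names taken from self_play.py):

import math

def widening_budget(visit_count, C_pw=1, a=0.49):
    # Maximum number of child actions allowed at a node with this many visits.
    return max(1, math.ceil(C_pw * visit_count ** a))

def maybe_expand(node, sample_action):
    # Only sample a new continuous action when the budget allows it;
    # otherwise the simulation descends into an existing child.
    if len(node.children) < widening_budget(node.visit_count):
        node.children.append(sample_action())
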
Paper uses SGD + self.weight_decay = 1e-4 # L2 weights regularization + self.momentum = 0.9 # Used only if optimizer is SGD + + # Exponential learning rate schedule + self.lr_init = 0.001 # Initial learning rate + self.lr_decay_rate = 1 # Set it to 1 to use a constant learning rate + self.lr_decay_steps = 1000 + + ### Replay Buffer + self.window_size = 1000 # Number of self-play games to keep in the replay buffer + self.num_unroll_steps = 10 # Number of game moves to keep for every batch element + self.td_steps = 50 # Number of steps in the future to take into account for calculating the target value + self.use_last_model_value = True # Use the last model to provide a fresher, stable n-step value (See paper appendix Reanalyze) + + # Prioritized Replay (See paper appendix Training) + self.PER = True # Select in priority the elements in the replay buffer which are unexpected for the network + self.use_max_priority = True # Use the n-step TD error as initial priority. Better for large replay buffer + self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1 + self.PER_beta = 1.0 + + ### Adjust the self play / training ratio to avoid over/underfitting + self.self_play_delay = 0 # Number of seconds to wait after each played game + self.training_delay = 0 # Number of seconds to wait after each training step + self.ratio = 1 / 2 # Desired self played games per training step ratio. Equivalent to a synchronous version, training can take much longer. Set it to None to disable it + + self.log_video = True + self.video_iter = 500 + + def visit_softmax_temperature_fn(self, trained_steps): + """ + Parameter to alter the visit count distribution to ensure that the action selection becomes greedier as training progresses. + The smaller it is, the more likely the best action (ie with the highest visit count) is chosen. + + Returns: + Positive float. + """ + if trained_steps < 0.5 * self.training_steps: + return 1 + elif trained_steps < 0.75 * self.training_steps: + return 0.1 + else: + return 0.01 + + +class Game(AbstractGame): + """ + Game wrapper. + """ + + def __init__(self, seed=None): + self.env = ReacherEnv() + if seed is not None: + self.env.seed(seed) + + def step(self, action): + """ + Apply action to the game. + + Args: + action : action of the action_space to take. + + Returns: + The new observation, the reward and a boolean if the game has ended. + """ + observation, reward, done, _ = self.env.step(action) + return numpy.array([[observation]]), reward, done + + def legal_actions(self): + #This is called in a few places in self_play but it does not affect the training + #if code is being re-published its probably best to remove this function to avoid confusion + """ + Should return the legal actions at each turn, if it is not available, it can return + the whole action space. At each turn, the game have to be able to handle one of returned actions. + + For complex game where calculating legal moves is too long, the idea is to define the legal actions + equal to the action space but to return a negative reward if the action is illegal. + + Returns: + An array of integers, subset of the action space. + """ + #return [i for i in range(2)] + numJoints = self.env.action_space.shape[0] + return [-numpy.ones(numJoints), numpy.ones(numJoints)] + + def reset(self): + """ + Reset the game for a new game. + + Returns: + Initial observation of the game. + """ + return numpy.array([[self.env.reset()]]) + + def close(self): + """ + Properly close the game. 
+ """ + self.env.close() + + def render(self, mode='human'): + """ + Display the game observation. + """ + ret = self.env.render(mode) + if mode == 'human': + input("Press enter to take a step ") + return ret + diff --git a/games/sawyershelf.py b/games/sawyershelf.py new file mode 100644 index 00000000..69b8f1f9 --- /dev/null +++ b/games/sawyershelf.py @@ -0,0 +1,188 @@ +import datetime +import math +import os + +import gym +import numpy +import torch + +from .abstract_game import AbstractGame +from games.shelf.sawyer_shelf import SawyerPegShelfEnvMultitask + +class MuZeroConfig: + def __init__(self): + self.seed = 0 # Seed for numpy, torch and the game + + ### Game + self.observation_shape = (1, 1, + 24) # Dimensions of the game observation, must be 3D (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array) + numJoints=9 + maxSteps = 40 + self.action_space = numpy.ones(numJoints) #models.py just does len of this variable to determine the action space #range of action space doesn't matter because gaussian network will + # create actions without regard to limits # Fixed list of all possible actions. You should only edit the length + self.players = [i for i in range(1)] # List of players. You should only edit the length + self.stacked_observations = 0 # Number of previous observations and previous actions to add to the current observation + + # Evaluate + self.muzero_player = 0 # Turn Muzero begins to play (0: MuZero plays first, 1: MuZero plays second) + self.opponent = None # Hard coded agent that MuZero faces to assess his progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class + + ### Self-Play + self.num_actors = 1 # Number of simultaneous threads self-playing to feed the replay buffer + self.max_moves = maxSteps # Maximum number of moves if game is not finished before + self.num_simulations = maxSteps # Number of future moves self-simulated + self.discount = 0.997 # Chronological discount of the reward + self.temperature_threshold = None # Number of moves before dropping temperature to 0 (ie playing according to the max) + + # Root prior exploration noise + self.root_dirichlet_alpha = 0.25 + self.root_exploration_fraction = 0.25 + + # UCB formula + self.pb_c_base = 19652 + self.pb_c_init = 1.25 + + #Progressive widening + self.progressive_widening_C_pw = 1 + self.progressive_widening_a = 0.49 + + ### Network + self.network = "fullyconnected" # "resnet" / "fullyconnected" + self.support_size = 23 # Value and reward are scaled (with almost sqrt) and encoded on a vector with a range of -support_size to support_size + + # Residual Network + self.downsample = False # Downsample observations before representation network (See paper appendix Network Architecture) + self.blocks = 1 # Number of blocks in the ResNet + self.channels = 2 # Number of channels in the ResNet + self.reduced_channels = 2 # Number of channels before heads of dynamic and prediction networks + self.resnet_fc_reward_layers = [] # Define the hidden layers in the reward head of the dynamic network + self.resnet_fc_value_layers = [] # Define the hidden layers in the value head of the prediction network + self.resnet_fc_policy_layers = [] # Define the hidden layers in the policy head of the prediction network + + # Fully Connected Network + self.encoding_size = 128 + self.fc_representation_layers = [] # Define the hidden layers in the representation network + self.fc_dynamics_layers = [256,256,256,256] # Define the hidden layers in 
the dynamics network + self.fc_reward_layers = [256,256,256,256] # Define the hidden layers in the reward network + self.fc_value_layers = [256,256,256,256] # Define the hidden layers in the value network + self.fc_policy_layers = [256,256,256,256] # Define the hidden layers in the policy network + + ### Training + self.results_path = os.path.join(os.path.dirname(__file__), "../results", os.path.basename(__file__)[:-3], + datetime.datetime.now().strftime( + "%Y-%m-%d--%H-%M-%S")) # Path to store the model weights and TensorBoard logs + self.training_steps = 500000 # Total number of training steps (ie weights update according to a batch) + self.batch_size = 128 # Number of parts of games to train on at each training step + self.checkpoint_interval = 10 # Number of training steps before using the model for sef-playing + self.value_loss_weight = 0.5 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze) + self.training_device = "cuda" if torch.cuda.is_available() else "cpu" # Train on GPU if available + + self.optimizer = "Adam" # "Adam" or "SGD". Paper uses SGD + self.weight_decay = 1e-4 # L2 weights regularization + self.momentum = 0.9 # Used only if optimizer is SGD + + # Exponential learning rate schedule + self.lr_init = 0.001 # Initial learning rate + self.lr_decay_rate = 1 # Set it to 1 to use a constant learning rate + self.lr_decay_steps = 1000 + + ### Replay Buffer + self.window_size = 5000 # Number of self-play games to keep in the replay buffer + self.num_unroll_steps = 10 # Number of game moves to keep for every batch element + self.td_steps = 20 # Number of steps in the future to take into account for calculating the target value + self.use_last_model_value = True # Use the last model to provide a fresher, stable n-step value (See paper appendix Reanalyze) + + # Prioritized Replay (See paper appendix Training) + self.PER = True # Select in priority the elements in the replay buffer which are unexpected for the network + self.use_max_priority = True # Use the n-step TD error as initial priority. Better for large replay buffer + self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1 + self.PER_beta = 1.0 + + ### Adjust the self play / training ratio to avoid over/underfitting + self.self_play_delay = 0 # Number of seconds to wait after each played game + self.training_delay = 0 # Number of seconds to wait after each training step + self.ratio = 1 / 2 # Desired self played games per training step ratio. Equivalent to a synchronous version, training can take much longer. Set it to None to disable it + + self.log_video = True + self.video_iter = 500 + + def visit_softmax_temperature_fn(self, trained_steps): + """ + Parameter to alter the visit count distribution to ensure that the action selection becomes greedier as training progresses. + The smaller it is, the more likely the best action (ie with the highest visit count) is chosen. + + Returns: + Positive float. + """ + if trained_steps < 0.5 * self.training_steps: + return 1 + elif trained_steps < 0.75 * self.training_steps: + return 0.1 + else: + return 0.01 + + +class Game(AbstractGame): + """ + Game wrapper. + """ + + def __init__(self, seed=None): + sparse = False + self.env = SawyerPegShelfEnvMultitask(sparse=sparse, stepMinusOne=sparse) + if seed is not None: + self.env.seed(seed) + + def step(self, action): + """ + Apply action to the game. + + Args: + action : action of the action_space to take. 
+ + Returns: + The new observation, the reward and a boolean if the game has ended. + """ + assert (len(action) == 9) + observation, reward, done, _ = self.env.step(action) + return numpy.array([[observation]]), reward, done + + def legal_actions(self): + """ + Should return the legal actions at each turn, if it is not available, it can return + the whole action space. At each turn, the game have to be able to handle one of returned actions. + + For complex game where calculating legal moves is too long, the idea is to define the legal actions + equal to the action space but to return a negative reward if the action is illegal. + + Returns: + An array of integers, subset of the action space. + """ + #return [i for i in range(2)] + numJoints = self.env.action_space.shape[0] + return [-numpy.ones(numJoints), numpy.ones(numJoints)] + + def reset(self): + """ + Reset the game for a new game. + + Returns: + Initial observation of the game. + """ + return numpy.array([[self.env.reset()]]) + + def close(self): + """ + Properly close the game. + """ + self.env.close() + + def render(self,mode='human'): + """ + Display the game observation. + """ + ret = self.env.render(mode) + if mode=='human': + input("Press enter to take a step ") + return ret + diff --git a/games/shelf/__init__.py b/games/shelf/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/games/shelf/assets/__init__.py b/games/shelf/assets/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/games/shelf/assets/eval_10_mugs_sawyer_shelf_12_07_18_17_32.xml b/games/shelf/assets/eval_10_mugs_sawyer_shelf_12_07_18_17_32.xml new file mode 100644 index 00000000..ea1305c6 --- /dev/null +++ b/games/shelf/assets/eval_10_mugs_sawyer_shelf_12_07_18_17_32.xml @@ -0,0 +1,551 @@ + + + + + + + + diff --git a/games/shelf/assets/eval_10_mugs_sawyer_shelf_12_07_18_26_16.xml b/games/shelf/assets/eval_10_mugs_sawyer_shelf_12_07_18_26_16.xml new file mode 100644 index 00000000..ea1305c6 --- /dev/null +++ b/games/shelf/assets/eval_10_mugs_sawyer_shelf_12_07_18_26_16.xml @@ -0,0 +1,551 @@ + + + + + + + + diff --git a/games/shelf/assets/eval_10_mugs_sawyer_shelf_14_07_16_10_00.xml b/games/shelf/assets/eval_10_mugs_sawyer_shelf_14_07_16_10_00.xml new file mode 100644 index 00000000..ea1305c6 --- /dev/null +++ b/games/shelf/assets/eval_10_mugs_sawyer_shelf_14_07_16_10_00.xml @@ -0,0 +1,551 @@ + + + + + + + + diff --git a/games/shelf/assets/eval_10_mugs_sawyer_shelf_14_07_16_29_13.xml b/games/shelf/assets/eval_10_mugs_sawyer_shelf_14_07_16_29_13.xml new file mode 100644 index 00000000..ea1305c6 --- /dev/null +++ b/games/shelf/assets/eval_10_mugs_sawyer_shelf_14_07_16_29_13.xml @@ -0,0 +1,551 @@ + + + + + + + + diff --git a/games/shelf/assets/eval_10_mugs_sawyer_shelf_22_07_19_35_06.xml b/games/shelf/assets/eval_10_mugs_sawyer_shelf_22_07_19_35_06.xml new file mode 100644 index 00000000..ea1305c6 --- /dev/null +++ b/games/shelf/assets/eval_10_mugs_sawyer_shelf_22_07_19_35_06.xml @@ -0,0 +1,551 @@ + + + + + + + + diff --git a/games/shelf/assets/generate_sawyer_shelf_xml.py b/games/shelf/assets/generate_sawyer_shelf_xml.py new file mode 100644 index 00000000..78276116 --- /dev/null +++ b/games/shelf/assets/generate_sawyer_shelf_xml.py @@ -0,0 +1,475 @@ +from datetime import datetime +import os +import IPython +e = IPython.embed + + +def generate_and_save_xml_file(weights, action_mode, is_eval): + assert action_mode in ["joint_position", "joint_delta_position", "torque"] + + xml_str = generate_xml_str(weights, action_mode) + + now = 
datetime.now() + time_stamp_str = now.strftime("%d_%m_%H_%M_%S") + file_name = f"{len(weights)}_mugs_sawyer_shelf_{time_stamp_str}.xml" + if is_eval: + file_name = "eval_" + file_name + else: + file_name = "train_" + file_name + destination_dir = os.path.dirname(os.path.abspath(__file__)) # same as this file's + full_path = os.path.join(destination_dir, file_name) + open(full_path, "w").write(xml_str) + + print("xml saved in ", full_path) + return full_path + + +def generate_xml_str(weights, action_mode): + weights = [w[0] for w in weights] # not the task has target pos in + all_mugs = "" + for i, w in enumerate(weights): + one_mug = f""" + + + + + + + + """ + all_mugs += one_mug + + + joint_qpos = "-0.029997 -0.56 0.0299998 2.10036 0.11904 -1.16064 -1.46072 0.0092 -0.00705" + all_mug_keys = "" + for i, w in enumerate(weights): + mug_i_key = f"0 {-0.1*i - 0.4} 0.007 1 0 0 0 " # the proper place they should stay + all_mug_keys += mug_i_key + all_mug_keys = all_mug_keys[:-1] # delete last space + + all_keys = f""" + + + """ + + print(f"\nCONTROL MODE {action_mode}\n") + if action_mode == "torque": + return PART1_torque + all_mugs + PART2 + all_keys + PART3 + elif action_mode in ["joint_position", "joint_delta_position"]: + return PART1_pos + all_mugs + PART2 + all_keys + PART3 + + +PART1_torque = """ + + + + + +