diff --git a/mo_gym/breakable_bottles/breakable_bottles.py b/mo_gym/breakable_bottles/breakable_bottles.py
index 2de827fd..6d85f5fe 100644
--- a/mo_gym/breakable_bottles/breakable_bottles.py
+++ b/mo_gym/breakable_bottles/breakable_bottles.py
@@ -1,17 +1,21 @@
+from os import terminal_size
+from typing import Optional
 from gym import Env
 from gym.spaces import Dict, Discrete, MultiBinary, Box
 import numpy as np


 class BreakableBottles(Env):
-    metadata = {"render_modes": ["human", "rgb_array"]}
+    metadata = {"render_modes": ["human"]}

     # actions
     LEFT = 0
     RIGHT = 1
     PICKUP = 2

-    def __init__(self, size=5, prob_drop=0.1, time_penalty=-1, bottle_reward=25, unbreakable_bottles=False, seed=None):
+    def __init__(self, render_mode: Optional[str] = None, size=5, prob_drop=0.1, time_penalty=-1, bottle_reward=25, unbreakable_bottles=False):
+        self.render_mode = render_mode
+
         # settings
         self.prob_drop = prob_drop
         self.time_penalty = time_penalty
@@ -101,18 +105,21 @@ def step(self, action):
         reward[2] = self.potential(observation) - old_potential

         info = {}
-        return observation, reward, terminal, info
+        if self.render_mode == "human":
+            self.render()
+        return observation, reward, terminal, False, info

-    def reset(self, seed=None, return_info=False, **kwargs):
+    def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None):
         super().reset(seed=seed)
-        self.np_random.seed(seed)
         self.r_star = 0
         self.location = self.size - 1
         self.bottles_carrying = 0
         self.bottles_delivered = 0
         self.bottles_dropped = [0]*(self.size - 2)
         state = self._get_obs()
-        return (state, {}) if return_info else state
+        if self.render_mode == "human":
+            self.render()
+        return state, {}

     def get_obs_idx(self, obs):
         multi_index = np.array([[obs["location"]],
@@ -127,22 +134,15 @@ def _get_obs(self):
                 "bottles_delivered": self.bottles_delivered,
                 "bottles_dropped": self.bottles_dropped.copy()}

-    def render(self, mode="human"):
-        if mode == 'rgb_array':
-            return np.array([self.state[:3], *self.state[3]])  # return RGB frame suitable for video
-        elif mode == 'human':
+    def render(self):
+        if self.render_mode == 'human':
             print("-----")
             print(f"Location: {self.location}\nCarrying {self.bottles_carrying} bottles.\nDelivered {self.bottles_delivered} so far.\nBottles have been dropped at tiles {'1' if self.bottles_dropped[0] > 0 else ''} {'2' if self.bottles_dropped[1] > 0 else ''} {'3' if self.bottles_dropped[2] > 0 else ''}")
             print("-----")
-        else:
-            super(BreakableBottles, self).render(mode=mode)  # just raise an exception

     def close(self):
         pass

-    def seed(self, seed=None):
-        self.seed = seed if not seed is None else np.random.randint(2**32)
-
     def potential(self, obs):
         if sum(obs["bottles_dropped"]) > 0:
             return -1
diff --git a/mo_gym/deep_sea_treasure/deep_sea_treasure.py b/mo_gym/deep_sea_treasure/deep_sea_treasure.py
index 9ac35b0e..5e935da4 100644
--- a/mo_gym/deep_sea_treasure/deep_sea_treasure.py
+++ b/mo_gym/deep_sea_treasure/deep_sea_treasure.py
@@ -1,4 +1,5 @@
 from pathlib import Path
+from typing import Optional

 import gym
 import numpy as np
@@ -46,7 +47,8 @@ class DeepSeaTreasure(gym.Env):

     metadata = {"render_modes": ["human", "rgb_array"], "render_fps": 4}

-    def __init__(self, dst_map=DEFAULT_MAP, float_state=False):
+    def __init__(self, render_mode: Optional[str] = None, dst_map=DEFAULT_MAP, float_state=False):
+        self.render_mode = render_mode
         self.size = 11
         self.window_size = 512
         self.window = None
@@ -87,7 +89,7 @@ def is_valid_state(self, state):
             return True
         return False

-    def render(self, mode='human'):
+    def render(self):
         # The size of a single grid square in pixels
         pix_square_size = self.window_size / self.size
         if self.window is None:
@@ -97,11 +99,11 @@ def render(self, mode='human'):
             self.treasure_img = pygame.image.load(str(Path(__file__).parent.absolute()) + '/assets/treasure.png')
             self.treasure_img = pygame.transform.scale(self.treasure_img, (pix_square_size, pix_square_size))

-        if self.window is None and mode == "human":
+        if self.window is None and self.render_mode == "human":
             pygame.init()
             pygame.display.init()
             self.window = pygame.display.set_mode((self.window_size, self.window_size))
-        if self.clock is None and mode == "human":
+        if self.clock is None and self.render_mode == "human":
             self.clock = pygame.time.Clock()
             self.font = pygame.font.SysFont(None, 30)

@@ -142,7 +144,7 @@ def render(self, mode='human'):
                 width=1,
             )

-        if mode == "human":
+        if self.render_mode == "human":
             # The following line copies our drawings from `canvas` to the visible window
             self.window.blit(canvas, canvas.get_rect())
             pygame.event.pump()
@@ -151,7 +153,7 @@ def render(self, mode='human'):
             # We need to ensure that human-rendering occurs at the predefined framerate.
             # The following line will automatically add a delay to keep the framerate stable.
             self.clock.tick(self.metadata["render_fps"])
-        else:  # rgb_array
+        elif self.render_mode == 'rgb_array':
             return np.transpose(
                 np.array(pygame.surfarray.pixels3d(canvas)), axes=(1, 0, 2)
             )
@@ -163,14 +165,15 @@ def get_state(self):
         state = self.current_state.copy()
         return state

-    def reset(self, seed=None, return_info=False, **kwargs):
+    def reset(self, seed=None, **kwargs):
         super().reset(seed=seed)
-        self.np_random.seed(seed)

         self.current_state = np.array([0, 0], dtype=np.int32)
         self.step_count = 0.0
         state = self.get_state()
-        return (state, {}) if return_info else state
+        if self.render_mode == "human":
+            self.render()
+        return state, {}

     def step(self, action):
         next_state = self.current_state + self.dir[action]
@@ -188,8 +191,9 @@ def step(self, action):
         vec_reward = np.array([treasure_value, time_penalty], dtype=np.float32)

         state = self.get_state()
-
-        return state, vec_reward, terminal, {}
+        if self.render_mode == "human":
+            self.render()
+        return state, vec_reward, terminal, False, {}

     def close(self):
         if self.window is not None:
diff --git a/mo_gym/four_room/four_room.py b/mo_gym/four_room/four_room.py
index fd40da95..38d8b791 100644
--- a/mo_gym/four_room/four_room.py
+++ b/mo_gym/four_room/four_room.py
@@ -1,4 +1,5 @@
 import random
+from typing import Optional

 import gym
 import numpy as np
@@ -24,6 +25,7 @@
 GREEN = (0, 128, 0)
 BLACK = (0, 0, 0)

+
 class FourRoom(gym.Env):
     """
     A discretized version of the gridworld environment introduced in [1]. Here, an agent learns to
@@ -41,7 +43,7 @@ class FourRoom(gym.Env):

     metadata = {"render_modes": ["human", "rgb_array"], "render_fps": 4}

-    def __init__(self, maze=MAZE):
+    def __init__(self, render_mode: Optional[str] = None, maze=MAZE):
         """
         Creates a new instance of the shapes environment.

@@ -56,6 +58,7 @@ def __init__(self, maze=MAZE):
                 0, 1, .... 9 indicates the type of shape to be placed in the corresponding cell
                 entries containing other characters are treated as regular empty cells
         """
+        self.render_mode = render_mode
         self.window_size = 512
         self.window = None
         self.clock = None
@@ -88,12 +91,13 @@ def state_to_array(self, state):
         s = [element for tupl in state for element in tupl]
         return np.array(s, dtype=np.int32)

-    def reset(self, seed=None, return_info=False, **kwargs):
+    def reset(self, seed=None, **kwargs):
         super().reset(seed=seed)
-        self.np_random.seed(seed)

         self.state = (random.choice(self.initial), tuple(0 for _ in range(len(self.shape_ids))))
-        return (self.state_to_array(self.state), {}) if return_info else self.state_to_array(self.state)
+        if self.render_mode == 'human':
+            self.render()
+        return self.state_to_array(self.state), {}

     def step(self, action):
         old_state = self.state
@@ -111,14 +115,16 @@ def step(self, action):
         else:
             raise Exception('bad action {}'.format(action))

+        terminated = False
+
         # out of bounds, cannot move
         if col < 0 or col >= self.width or row < 0 or row >= self.height:
-            return self.state_to_array(self.state), np.zeros(len(self.all_shapes), dtype=np.float32), False, {}
+            return self.state_to_array(self.state), np.zeros(len(self.all_shapes), dtype=np.float32), terminated, False, {}

         # into a blocked cell, cannot move
         s1 = (row, col)
         if s1 in self.occupied:
-            return self.state_to_array(self.state), np.zeros(len(self.all_shapes), dtype=np.float32), False, {}
+            return self.state_to_array(self.state), np.zeros(len(self.all_shapes), dtype=np.float32), terminated, False, {}

         # can now move
         self.state = (s1, collected)
@@ -126,14 +132,15 @@ def step(self, action):
         # into a goal cell
         if s1 == self.goal:
             phi = np.ones(len(self.all_shapes), dtype=np.float32)
-            return self.state_to_array(self.state), phi, True, {}
+            terminated = True
+            return self.state_to_array(self.state), phi, terminated, False, {}

         # into a shape cell
         if s1 in self.shape_ids:
             shape_id = self.shape_ids[s1]
             if collected[shape_id] == 1:
                 # already collected this flag
-                return self.state_to_array(self.state), np.zeros(len(self.all_shapes), dtype=np.float32), False, {}
+                return self.state_to_array(self.state), np.zeros(len(self.all_shapes), dtype=np.float32), terminated, False, {}
             else:
                 # collect the new flag
                 collected = list(collected)
@@ -141,10 +148,10 @@ def step(self, action):
                 collected = tuple(collected)
                 self.state = (s1, collected)
                 phi = self.features(old_state, action, self.state)
-                return self.state_to_array(self.state), phi, False, {}
+                return self.state_to_array(self.state), phi, terminated, False, {}

         # into an empty cell
-        return self.state_to_array(self.state), np.zeros(len(self.all_shapes), dtype=np.float32), False, {}
+        return self.state_to_array(self.state), np.zeros(len(self.all_shapes), dtype=np.float32), terminated, False, {}

     def features(self, state, action, next_state):
         s1, _ = next_state
@@ -160,15 +167,15 @@ def features(self, state, action, next_state):
             phi[nc] = np.ones(nc, dtype=np.float32)
         return phi

-    def render(self, mode='human'):
+    def render(self):
         # The size of a single grid square in pixels
         pix_square_size = self.window_size / 13

-        if self.window is None and mode == "human":
+        if self.window is None and self.render_mode == "human":
             pygame.init()
             pygame.display.init()
             self.window = pygame.display.set_mode((self.window_size, self.window_size))
-        if self.clock is None and mode == "human":
+        if self.clock is None and self.render_mode == "human":
             self.clock = pygame.time.Clock()

         canvas = pygame.Surface((self.window_size, self.window_size))
@@ -232,7 +239,7 @@ def render(self, mode='human'):
                 width=1,
             )

-        if mode == "human":
+        if self.render_mode == "human":
             # The following line copies our drawings from `canvas` to the visible window
             self.window.blit(canvas, canvas.get_rect())
             pygame.event.pump()
@@ -241,7 +248,7 @@ def render(self, mode='human'):
             # We need to ensure that human-rendering occurs at the predefined framerate.
             # The following line will automatically add a delay to keep the framerate stable.
             self.clock.tick(self.metadata["render_fps"])
-        else:  # rgb_array
+        elif self.render_mode == 'rgb_array':
             return np.transpose(
                 np.array(pygame.surfarray.pixels3d(canvas)), axes=(1, 0, 2)
             )
diff --git a/mo_gym/fruit_tree/fruit_tree.py b/mo_gym/fruit_tree/fruit_tree.py
index 14000736..fb7db411 100644
--- a/mo_gym/fruit_tree/fruit_tree.py
+++ b/mo_gym/fruit_tree/fruit_tree.py
@@ -268,13 +268,12 @@ def get_ind(self, pos):
     def get_tree_value(self, pos):
         return self.tree[self.get_ind(pos)]

-    def reset(self, seed=None, return_info=False, **kwargs):
+    def reset(self, seed=None, **kwargs):
         super().reset(seed=seed)
-        self.np_random.seed(seed)

         self.current_state = np.array([0, 0], dtype=np.int32)
         self.terminal = False
-        return (self.current_state.copy(), {}) if return_info else self.current_state.copy()
+        return self.current_state.copy(), {}

     def step(self, action):
         direction = {
@@ -288,4 +287,4 @@ def step(self, action):
         if self.current_state[0] == self.tree_depth:
             self.terminal = True
-        return self.current_state.copy(), reward, self.terminal, {}
+        return self.current_state.copy(), reward, self.terminal, False, {}
diff --git a/mo_gym/mario/mario.py b/mo_gym/mario/mario.py
index ecece297..1d28c5df 100644
--- a/mo_gym/mario/mario.py
+++ b/mo_gym/mario/mario.py
@@ -23,7 +23,7 @@ def __init__(self, rom_mode='pixel', lost_levels=False, target=None, objectives=
         self.single_stage = True
         self.done_when_dead = True

-    def reset(self, seed=None, return_info=False, **kwargs):
+    def reset(self, seed=None, **kwargs):
         self._np_random, seed = seeding.np_random(seed)  # this is not used
         self.coin = 0
         self.x_pos = 0
@@ -31,7 +31,7 @@ def reset(self, seed=None, return_info=False, **kwargs):
         self.score = 0
         self.stage_bonus = 0
         self.lives = 2
-        return (super().reset(), {}) if return_info else super().reset()
+        return super().reset(), {}

     def step(self, action):
         obs, reward, done, info = super().step(action)
@@ -97,7 +97,7 @@ def step(self, action):

         info['score'] = info['score'] + self.stage_bonus

-        return obs, mor, bool(done), info
+        return obs, mor, bool(done), False, info


 if __name__ == '__main__':
diff --git a/mo_gym/minecart/minecart.py b/mo_gym/minecart/minecart.py
index ed7e6d9e..1a1120d9 100644
--- a/mo_gym/minecart/minecart.py
+++ b/mo_gym/minecart/minecart.py
@@ -6,6 +6,7 @@
 from math import ceil
 from pathlib import Path
 from copy import deepcopy
+from typing import Optional

 import gym
 import numpy as np
@@ -171,9 +172,10 @@ class Minecart(gym.Env):
     metadata = {'render_modes': ['rgb_array', 'human'], 'render_fps': FPS}

     def __init__(self,
+                 render_mode: Optional[str] = None,
                  image_observation=False,
                  config=str(Path(__file__).parent.absolute()) + '/mine_config.json'):
-
+        self.render_mode = render_mode
         self.screen = None
         self.last_render_mode_used = None
         self.config = config
@@ -495,8 +497,10 @@ def step(self, action, frame_skip=4, incremental_frame_skip=True):

         if change and self.image_observation:
             self.render_pygame()
+        if self.render_mode == 'human':
+            self.render()

-        return self.get_state(change), reward, self.end, {}
+        return self.get_state(change), reward, self.end, False, {}

     def mine(self):
         """Perform the MINE action
@@ -564,14 +568,13 @@ def get_state(self, update=True):
             "pixels": self.get_pixels(update)
         }
         """

-    def reset(self, seed=None, return_info=False, **kwargs):
+    def reset(self, seed=None, **kwargs):
         """Resets the environment to the start state

         Returns:
             [type] -- [description]
         """
         super().reset(seed=seed)
-        self.np_random.seed(seed)
         if self.screen is None and self.image_observation:
             self.render(mode='rgb_array')  # init pygame
@@ -585,7 +588,9 @@ def reset(self, seed=None, return_info=False, **kwargs):
         self.cart.angle = 45
         self.cart.departed = False
         self.end = False
-        return (self.get_state(), {}) if return_info else self.get_state()
+        if self.render_mode == 'human':
+            self.render()
+        return self.get_state(), {}

     def __str__(self):
         string = "Completed: {} ".format(self.end)
@@ -596,11 +601,11 @@ def __str__(self):
         string += "Position: {} ".format(self.cart.pos)
         return string

-    def render(self, mode='human'):
-        if self.screen is None or self.last_render_mode_used != mode:
-            self.last_render_mode_used = mode
+    def render(self):
+        if self.screen is None or self.last_render_mode_used != self.render_mode:
+            self.last_render_mode_used = self.render_mode
             pygame.init()
-            self.screen = pygame.display.set_mode((WIDTH, HEIGHT), flags=pygame.HIDDEN if mode=='rgb_array' else 0)
+            self.screen = pygame.display.set_mode((WIDTH, HEIGHT), flags=pygame.HIDDEN if self.render_mode=='rgb_array' else 0)
             self.clock = pygame.time.Clock()
             self.initialize_mines()
@@ -615,10 +620,10 @@ def render(self, mode='human'):

         if not self.image_observation:
             self.render_pygame()  # if the obs is not an image, then step would not have rendered the screen

-        if mode == 'human':
+        if self.render_mode == 'human':
             self.clock.tick(FPS)
             pygame.display.update()
-        elif mode == 'rgb_array':
+        elif self.render_mode == 'rgb_array':
             string_image = pygame.image.tostring(self.screen, 'RGB')
             temp_surf = pygame.image.fromstring(string_image, (WIDTH, HEIGHT), 'RGB')
             tmp_arr = pygame.surfarray.array3d(temp_surf)
@@ -754,11 +759,11 @@ def pareto_filter(costs, minimize=True):


 if __name__ == '__main__':
-    env = Minecart(image_observation=True)
+    env = Minecart(render_mode='human', image_observation=True)
     done = False
     env.reset()
     while True:
-        env.render(mode='human')
+        env.render()
         obs, r, done, info = env.step(env.action_space.sample())
         #print(str(env))
         if done:
diff --git a/mo_gym/mountain_car/mountain_car.py b/mo_gym/mountain_car/mountain_car.py
index 33b1f961..84816fa3 100644
--- a/mo_gym/mountain_car/mountain_car.py
+++ b/mo_gym/mountain_car/mountain_car.py
@@ -1,4 +1,5 @@
 import math
+from typing import Optional

 import numpy as np
 from gym import spaces
@@ -7,8 +8,8 @@

 class MOMountainCar(MountainCarEnv):

-    def __init__(self, goal_velocity=0):
-        super().__init__(goal_velocity)
+    def __init__(self, render_mode: Optional[str] = None, goal_velocity=0):
+        super().__init__(render_mode, goal_velocity)

         self.reward_space = spaces.Box(low=-1, high=1, shape=(3,), dtype=np.float32)

@@ -23,12 +24,14 @@ def step(self, action: int):
         if position == self.min_position and velocity < 0:
             velocity = 0

-        done = bool(position >= self.goal_position and velocity >= self.goal_velocity)
+        terminated = bool(position >= self.goal_position and velocity >= self.goal_velocity)

         #reward = -1.0
         reward = np.zeros(3, dtype=np.float32)
-        reward[0] = 0.0 if done else -1.0  # time penalty
+        reward[0] = 0.0 if terminated else -1.0  # time penalty
         reward[1] = 0.0 if action != 0 else -1.0  # reverse penalty
         reward[2] = 0.0 if action != 2 else -1.0  # forward penalty

         self.state = (position, velocity)
-        return np.array(self.state, dtype=np.float32), reward, done, {}
+        if self.render_mode == "human":
+            self.render()
+        return np.array(self.state, dtype=np.float32), reward, terminated, False, {}
diff --git a/mo_gym/mujoco/half_cheetah.py b/mo_gym/mujoco/half_cheetah.py
index 837d2901..48647c0f 100644
--- a/mo_gym/mujoco/half_cheetah.py
+++ b/mo_gym/mujoco/half_cheetah.py
@@ -9,6 +9,6 @@ def __init__(self, **kwargs):
         self.reward_space = Box(low=-np.inf, high=np.inf, shape=(2,))

     def step(self, action):
-        observation, reward, done, info = super().step(action)
+        observation, reward, terminated, truncated, info = super().step(action)
         vec_reward = np.array([info['reward_run'], info['reward_ctrl']], dtype=np.float32)
-        return observation, vec_reward, done, info
+        return observation, vec_reward, terminated, truncated, info
diff --git a/mo_gym/mujoco/hopper.py b/mo_gym/mujoco/hopper.py
index 94651a14..cabfc1f9 100644
--- a/mo_gym/mujoco/hopper.py
+++ b/mo_gym/mujoco/hopper.py
@@ -24,7 +24,7 @@ def step(self, action):
         observation = self._get_obs()

         #reward = rewards - costs
-        done = self.done
+        terminated = self.terminated

         z = self.data.qpos[1]
         height = 10*(z - self.init_qpos[1])
@@ -39,4 +39,6 @@ def step(self, action):
             "energy_reward": -energy_cost,
         }

-        return observation, vec_reward, done, info
+        if self.render_mode == "human":
+            self.render()
+        return observation, vec_reward, terminated, False, info
diff --git a/mo_gym/reacher/reacher.py b/mo_gym/reacher/reacher.py
index 5ef16866..751fe219 100644
--- a/mo_gym/reacher/reacher.py
+++ b/mo_gym/reacher/reacher.py
@@ -51,7 +51,7 @@ def step(self, a):

         self.HUD(state, real_action, False)

-        return state, phi, False, {}
+        return state, phi, False, False, {}

     def camera_adjust(self):
         x, y, z = self.robot.fingertip.pose().xyz()
@@ -59,9 +59,9 @@ def camera_adjust(self):
         y *= 0.5
         self.camera.move_and_look_at(0.3, 0.3, 0.3, x, y, z)

-    def reset(self, seed=None, return_info=False, **kwargs):
-        self.np_random.seed(seed)
-        return (super().reset(), {}) if return_info else super().reset()
+    def reset(self, seed=None, **kwargs):
+        self._seed(seed)
+        return super().reset(), {}


 class ReacherRobot(MJCFBasedRobot):
diff --git a/mo_gym/resource_gathering/resource_gathering.py b/mo_gym/resource_gathering/resource_gathering.py
index c9f35c28..47371ff8 100644
--- a/mo_gym/resource_gathering/resource_gathering.py
+++ b/mo_gym/resource_gathering/resource_gathering.py
@@ -1,4 +1,5 @@
 from pathlib import Path
+from typing import Optional

 import gym
 import numpy as np
@@ -15,7 +16,8 @@ class ResourceGathering(gym.Env):

     metadata = {"render_modes": ["human", "rgb_array"], "render_fps": 4}

-    def __init__(self):
+    def __init__(self, render_mode: Optional[str] = None):
+        self.render_mode = render_mode
         self.size = 5
         self.window_size = 512
         self.window = None
@@ -52,7 +54,7 @@ def get_map_value(self, pos):
     def is_valid_state(self, state):
         return state[0] >= 0 and state[0] < self.size and state[1] >= 0 and state[1] < self.size

-    def render(self, mode='human'):
+    def render(self):
         # The size of a single grid square in pixels
         pix_square_size = self.window_size / self.size
         if self.window is None:
@@ -67,11 +69,11 @@ def render(self, mode='human'):
             self.agent_img = pygame.image.load(str(Path(__file__).parent.absolute()) + '/assets/stickerman.png')
             self.agent_img = pygame.transform.scale(self.agent_img, (pix_square_size, pix_square_size))

-        if self.window is None and mode == "human":
+        if self.window is None and self.render_mode == "human":
             pygame.init()
             pygame.display.init()
             self.window = pygame.display.set_mode((self.window_size, self.window_size))
-        if self.clock is None and mode == "human":
+        if self.clock is None and self.render_mode == "human":
             self.clock = pygame.time.Clock()

         canvas = pygame.Surface((self.window_size, self.window_size))
@@ -106,7 +108,7 @@ def render(self, mode='human'):
                 width=2,
             )

-        if mode == "human":
+        if self.render_mode == "human":
             # The following line copies our drawings from `canvas` to the visible window
             self.window.blit(canvas, canvas.get_rect())
             pygame.event.pump()
@@ -115,7 +117,7 @@ def render(self, mode='human'):
             # We need to ensure that human-rendering occurs at the predefined framerate.
             # The following line will automatically add a delay to keep the framerate stable.
             self.clock.tick(self.metadata["render_fps"])
-        else:  # rgb_array
+        elif self.render_mode == 'rgb_array':  # rgb_array
             return np.transpose(
                 np.array(pygame.surfarray.pixels3d(canvas)), axes=(1, 0, 2)
             )
@@ -125,16 +127,17 @@ def get_state(self):
         state = np.concatenate((pos, np.array([self.has_gold, self.has_gem], dtype=np.int32)))
         return state

-    def reset(self, seed=None, return_info=False, **kwargs):
+    def reset(self, seed=None, **kwargs):
         super().reset(seed=seed)
-        self.np_random.seed(seed)

         self.current_pos = self.initial_pos
         self.has_gem = 0
         self.has_gold = 0
         self.step_count = 0.0
         state = self.get_state()
-        return (state, {}) if return_info else state
+        if self.render_mode == 'human':
+            self.render()
+        return state, {}

     def step(self, action):
         next_pos = self.current_pos + self.dir[action]
@@ -160,8 +163,9 @@ def step(self, action):
             vec_reward[2] = self.has_gem

         state = self.get_state()
-
-        return state, vec_reward, done, {}
+        if self.render_mode == 'human':
+            self.render()
+        return state, vec_reward, done, False, {}

     def close(self):
         if self.window is not None:
diff --git a/mo_gym/utils.py b/mo_gym/utils.py
index 47fa1767..12182bed 100644
--- a/mo_gym/utils.py
+++ b/mo_gym/utils.py
@@ -32,12 +32,12 @@ def set_weight(self, weight):
         assert weight.shape == self.env.reward_space.shape, "Reward weight has different shape than reward vector."
         self.w = weight

-    def step(self, action: ActType) -> Tuple[ObsType, float, bool, dict]:
-        observation, reward, done, info = self.env.step(action)
+    def step(self, action: ActType) -> Tuple[ObsType, float, bool, bool, dict]:
+        observation, reward, terminated, truncated, info = self.env.step(action)
         scalar_reward = np.dot(reward, self.w)
         info['vector_reward'] = reward

-        return observation, scalar_reward, done, info
+        return observation, scalar_reward, terminated, truncated, info


 class MONormalizeReward(gym.Wrapper):
@@ -66,17 +66,20 @@ def __init__(self, env: gym.Env, idx: int, gamma: float = 0.99, epsilon: float =

     def step(self, action: ActType):
         """Steps through the environment, normalizing the rewards returned."""
-        obs, rews, dones, infos = self.env.step(action)
+        obs, rews, terminated, truncated, infos = self.env.step(action)
+        # Extracts the objective value to normalize
         to_normalize = rews[self.idx]
         if not self.is_vector_env:
             to_normalize = np.array([to_normalize])
         self.returns = self.returns * self.gamma + to_normalize
+        # Defer normalization to gym implementation
         to_normalize = self.normalize(to_normalize)
-        self.returns[dones] = 0.0
+        self.returns[terminated] = 0.0
         if not self.is_vector_env:
             to_normalize = to_normalize[0]
+        # Injecting the normalized objective value back into the reward vector
         rews[self.idx] = to_normalize
-        return obs, rews, dones, infos
+        return obs, rews, terminated, truncated, infos

     def normalize(self, rews):
         """Normalizes the rewards with the running mean rewards and their variance."""
@@ -126,7 +129,7 @@ def add_vector_episode_statistics(
         info (dict): info dict of the environment.
         episode_info (dict): episode statistics data.
         num_envs (int): number of environments.
-        num_envs (int): number of objectives.
+        num_objs (int): number of objectives.
         env_num (int): env number of the vectorized environments.

     Returns:
@@ -170,19 +173,28 @@ def reset(self, **kwargs):

     def step(self, action):
         """Steps through the environment, recording the episode statistics."""
         # This is the code from the RecordEpisodeStatistics wrapper from gym.
-        observations, rewards, dones, infos = self.env.step(action)
+        (
+            observations,
+            rewards,
+            terminateds,
+            truncateds,
+            infos,
+        ) = self.env.step(action)
         assert isinstance(
             infos, dict
         ), f"`info` dtype is {type(infos)} while supported dtype is `dict`. This may be due to usage of other wrappers in the wrong order."
         self.episode_returns += rewards
+        # The discounted returns are also computed here
         self.disc_episode_returns += (rewards * np.repeat(self.gamma ** self.episode_lengths, self.reward_dim).reshape(self.episode_returns.shape))
         self.episode_lengths += 1
         if not self.is_vector_env:
-            dones = [dones]
-        dones = list(dones)
+            terminateds = [terminateds]
+            truncateds = [truncateds]
+        terminateds = list(terminateds)
+        truncateds = list(truncateds)

-        for i in range(len(dones)):
-            if dones[i]:
+        for i in range(len(terminateds)):
+            if terminateds[i] or truncateds[i]:
                 episode_return = deepcopy(self.episode_returns[i])  # Makes a deepcopy to avoid subsequent mutations
                 disc_episode_return = deepcopy(self.disc_episode_returns[i])  # Makes a deepcopy to avoid subsequent mutations
                 episode_length = self.episode_lengths[i]
@@ -209,7 +221,8 @@ def step(self, action):
         return (
             observations,
             rewards,
-            dones if self.is_vector_env else dones[0],
+            terminateds if self.is_vector_env else terminateds[0],
+            truncateds if self.is_vector_env else truncateds[0],
             infos,
         )
diff --git a/setup.py b/setup.py
index 678e255a..ba14956a 100644
--- a/setup.py
+++ b/setup.py
@@ -12,7 +12,7 @@

 setup(
     name="mo-gym",
-    version="0.1.2",
+    version="0.2.0",
     description="Environments for Multi-Objective RL.",
     url="https://www.github.com/LucasAlegre/mo-gym",
     author="LucasAlegre",
@@ -21,7 +21,7 @@
     packages=packages,
     package_data=package_data,
     install_requires=[
-        "gym==0.24.1",  # 0.25 has breaking changes
+        "gym>=0.26",
         "numpy",
         "pygame",
         "scipy",
@@ -39,4 +39,10 @@
     ],
 )

+# python setup.py sdist
+# python setup.py bdist_wheel
+# twine upload --repository-url https://upload.pypi.org/legacy/ dist/*
+# twine upload --repository-url https://test.pypi.org/legacy/ dist/*
+# twine upload dist/*
+
 # https://towardsdatascience.com/create-your-own-python-package-and-publish-it-into-pypi-9306a29bc116
\ No newline at end of file
diff --git a/tests/test_wrappers.py b/tests/test_wrappers.py
index 14c60633..1d504b6f 100644
--- a/tests/test_wrappers.py
+++ b/tests/test_wrappers.py
@@ -11,7 +11,7 @@ def go_to_8_3(env):
     env.reset()
     env.step(3)  # right
     env.step(1)  # down
-    _, rewards, _, infos = env.step(1)
+    _, rewards, _, _, infos = env.step(1)
     return rewards, infos


@@ -24,7 +24,7 @@ def test_normalization_wrapper():
     for i in range(30):
         go_to_8_3(both_norm_env)
     both_norm_env.reset()
-    _, rewards, _, _ = both_norm_env.step(1)  # down
+    _, rewards, _, _, _ = both_norm_env.step(1)  # down
     np.testing.assert_allclose(rewards, [0.18, -1.24], rtol=0, atol=1e-2)
     rewards, _ = go_to_8_3(both_norm_env)
     np.testing.assert_allclose(rewards, [2.13, -1.24], rtol=0, atol=1e-2)
@@ -33,7 +33,7 @@ def test_normalization_wrapper():
     for i in range(30):
         go_to_8_3(norm_treasure_env)
     norm_treasure_env.reset()
-    _, rewards, _, _ = norm_treasure_env.step(1)  # down
+    _, rewards, _, _, _ = norm_treasure_env.step(1)  # down
     # Time rewards are not normalized (-1)
     np.testing.assert_allclose(rewards, [0.18, -1.], rtol=0, atol=1e-2)
     rewards, _ = go_to_8_3(norm_treasure_env)
@@ -47,14 +47,14 @@ def test_clip_wrapper():

     # Tests for both rewards clipped
     both_clipped_env.reset()
-    _, rewards, _, _ = both_clipped_env.step(1)  # down
+    _, rewards, _, _, _ = both_clipped_env.step(1)  # down
     np.testing.assert_allclose(rewards, [0.5, -0.5], rtol=0, atol=1e-2)
     rewards, _ = go_to_8_3(both_clipped_env)
     np.testing.assert_allclose(rewards, [0.5, -0.5], rtol=0, atol=1e-2)

     # Tests for only treasure clipped
     clip_treasure_env.reset()
-    _, rewards, _, _ = clip_treasure_env.step(1)  # down
+    _, rewards, _, _, _ = clip_treasure_env.step(1)  # down
     # Time rewards are not clipped (-1)
     np.testing.assert_allclose(rewards, [0.5, -1.], rtol=0, atol=1e-2)
     rewards, _ = go_to_8_3(clip_treasure_env)
@@ -76,10 +76,11 @@ def thunk():
     ])

     envs.reset()
-    obs, rewards, dones, infos = envs.step(envs.action_space.sample())
+    obs, rewards, terminateds, truncateds, infos = envs.step(envs.action_space.sample())
     assert len(obs) == num_envs, "Number of observations do not match the number of envs"
     assert len(rewards) == num_envs, "Number of rewards do not match the number of envs"
-    assert len(dones) == num_envs, "Number of dones do not match the number of envs"
+    assert len(terminateds) == num_envs, "Number of terminateds do not match the number of envs"
+    assert len(truncateds) == num_envs, "Number of truncateds do not match the number of envs"

 def test_mo_record_ep_statistic():
     env = mo_gym.make("deep-sea-treasure-v0")
@@ -113,9 +114,9 @@ def thunk():
     envs = MORecordEpisodeStatistics(envs)

     envs.reset()
-    dones = np.array([False] * num_envs)
-    while not np.any(dones):
-        obs, rewards, dones, info = envs.step(envs.action_space.sample())
+    terminateds = np.array([False] * num_envs)
+    while not np.any(terminateds):
+        obs, rewards, terminateds, _, info = envs.step(envs.action_space.sample())
     print(info)

     assert(isinstance(info["episode"]["r"], np.ndarray))
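
For reference, below is a minimal usage sketch of an environment after this migration to the Gym >= 0.26 API. It assumes that mo_gym.make forwards keyword arguments such as render_mode to the wrapped environment, the same way gym.make does; "deep-sea-treasure-v0" is the registration id used in the tests above.

import numpy as np
import mo_gym

# The render mode is now chosen at construction time instead of being passed to render().
env = mo_gym.make("deep-sea-treasure-v0", render_mode="human")

# reset() now returns (observation, info) and takes the seed as a keyword argument.
obs, info = env.reset(seed=42)

terminated = truncated = False
while not (terminated or truncated):
    # step() now returns a 5-tuple: (obs, vector_reward, terminated, truncated, info).
    obs, vec_reward, terminated, truncated, info = env.step(env.action_space.sample())
    assert isinstance(vec_reward, np.ndarray)  # one reward component per objective

env.close()

Note that human-mode rendering is triggered inside step() and reset() by the environments themselves, which is why the patched classes call self.render() there rather than expecting the caller to do it.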