From 5346eee2a74a3f7adc7d1881991eb9bc08e131a9 Mon Sep 17 00:00:00 2001
From: huangshiyu
Date: Wed, 20 Dec 2023 15:01:16 +0800
Subject: [PATCH 1/2] init v0.2.0

---
 README.md          | 2 +-
 openrl/__init__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 2b175f86..b58c3ea6 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@
 [![Embark](https://img.shields.io/badge/discord-OpenRL-%237289da.svg?logo=discord)](https://discord.gg/qMbVT2qBhr)
 [![slack badge](https://img.shields.io/badge/Slack-join-blueviolet?logo=slack&)](https://join.slack.com/t/openrlhq/shared_invite/zt-1tqwpvthd-Eeh0IxQ~DIaGqYXoW2IUQg)
 
-OpenRL-v0.1.10 is updated on Oct 27, 2023
+OpenRL-v0.2.0 is updated on Dec 20, 2023
 
 The main branch is the latest version of OpenRL, which is under active development. If you just want to have a try with
 OpenRL, you can switch to the stable branch.

diff --git a/openrl/__init__.py b/openrl/__init__.py
index 53ded95e..2ea67943 100644
--- a/openrl/__init__.py
+++ b/openrl/__init__.py
@@ -1,5 +1,5 @@
 __TITLE__ = "openrl"
-__VERSION__ = "v0.1.10"
+__VERSION__ = "v0.2.0"
 __DESCRIPTION__ = "Distributed Deep RL Framework"
 __AUTHOR__ = "OpenRL Contributors"
 __EMAIL__ = "huangshiyu@4paradigm.com"

From 2b798c08e473db58836b1c74c75f59f074c2fd50 Mon Sep 17 00:00:00 2001
From: huangshiyu
Date: Wed, 20 Dec 2023 15:01:36 +0800
Subject: [PATCH 2/2] init v0.2.0

---
 examples/envpool/envpool_wrappers.py          |  9 ++++---
 examples/envpool/make_env.py                  | 11 +++++----
 examples/envpool/train_ppo.py                 |  4 ++--
 openrl/envs/common/build_envs.py              |  2 +-
 openrl/envs/common/registration.py            |  2 +-
 openrl/envs/nlp/daily_dialog_env.py           | 23 +++++++++---------
 openrl/envs/nlp/fake_dialog_env.py            | 22 +++++++++--------
 openrl/envs/nlp/rewards/intent.py             |  5 ++--
 openrl/envs/nlp/rewards/kl_penalty.py         | 15 ++++++------
 openrl/envs/nlp/utils/metrics/meteor.py       | 24 +++++++++++--------
 openrl/modules/networks/policy_network_gpt.py |  1 -
 openrl/modules/networks/value_network_gpt.py  |  1 -
 openrl/modules/utils/valuenorm.py             | 12 +++++++---
 13 files changed, 72 insertions(+), 59 deletions(-)

diff --git a/examples/envpool/envpool_wrappers.py b/examples/envpool/envpool_wrappers.py
index d0da090a..bf975166 100644
--- a/examples/envpool/envpool_wrappers.py
+++ b/examples/envpool/envpool_wrappers.py
@@ -9,8 +9,7 @@
 from packaging import version
 from stable_baselines3.common.vec_env import VecEnvWrapper as BaseWrapper
 from stable_baselines3.common.vec_env import VecMonitor
-from stable_baselines3.common.vec_env.base_vec_env import (VecEnvObs,
-                                                           VecEnvStepReturn)
+from stable_baselines3.common.vec_env.base_vec_env import VecEnvObs, VecEnvStepReturn
 
 is_legacy_gym = version.parse(gym.__version__) < version.parse("0.26.0")
 
@@ -114,9 +113,9 @@ def __init__(
 
         if is_wrapped_with_monitor:
             warnings.warn(
-                "The environment is already wrapped with a `Monitor` wrapper"
-                "but you are wrapping it with a `VecMonitor` wrapper, the `Monitor` statistics will be"
-                "overwritten by the `VecMonitor` ones.",
+                "The environment is already wrapped with a `Monitor` wrapper but you are"
+                " wrapping it with a `VecMonitor` wrapper, the `Monitor` statistics"
+                " will be overwritten by the `VecMonitor` ones.",
                 UserWarning,
             )
 
diff --git a/examples/envpool/make_env.py b/examples/envpool/make_env.py
index 92c1b51a..669ca67a 100644
--- a/examples/envpool/make_env.py
+++ b/examples/envpool/make_env.py
@@ -5,9 +5,12 @@
 
 import envpool
 from gymnasium import Env
-
-from openrl.envs.vec_env import (AsyncVectorEnv, RewardWrapper,
-                                 SyncVectorEnv, VecMonitorWrapper)
+from openrl.envs.vec_env import (
+    AsyncVectorEnv,
+    RewardWrapper,
+    SyncVectorEnv,
+    VecMonitorWrapper,
+)
 from openrl.envs.vec_env.vec_info import VecInfoFactory
 from openrl.envs.wrappers.base_wrapper import BaseWrapper
 from openrl.rewards import RewardFactory
@@ -76,7 +79,7 @@ def make_envpool_envs(
     assert kwargs.get("env_type") in ["gym", "dm", "gymnasium"]
     kwargs["envpool"] = True
 
-    if 'env_wrappers' in kwargs:
+    if "env_wrappers" in kwargs:
         env_wrappers = kwargs.pop("env_wrappers")
     else:
         env_wrappers = []
diff --git a/examples/envpool/train_ppo.py b/examples/envpool/train_ppo.py
index a02151f7..b6550b96 100644
--- a/examples/envpool/train_ppo.py
+++ b/examples/envpool/train_ppo.py
@@ -16,10 +16,10 @@
 """"""
 
 import numpy as np
-
-from openrl.configs.config import create_config_parser
 from make_env import make
+
+from examples.envpool.envpool_wrappers import VecAdapter, VecMonitor
+from openrl.configs.config import create_config_parser
 from openrl.modules.common import PPONet as Net
 from openrl.modules.common.ppo_net import PPONet as Net
 from openrl.runners.common import PPOAgent as Agent
diff --git a/openrl/envs/common/build_envs.py b/openrl/envs/common/build_envs.py
index a0c59c6f..76f4b35b 100644
--- a/openrl/envs/common/build_envs.py
+++ b/openrl/envs/common/build_envs.py
@@ -69,4 +69,4 @@ def _make_env() -> Env:
         return _make_env
 
     env_fns = [create_env(env_id, env_num, need_env_id) for env_id in range(env_num)]
-    return env_fns
\ No newline at end of file
+    return env_fns
diff --git a/openrl/envs/common/registration.py b/openrl/envs/common/registration.py
index 1ee9b532..5d1ed645 100644
--- a/openrl/envs/common/registration.py
+++ b/openrl/envs/common/registration.py
@@ -173,4 +173,4 @@ def make(
     vec_info_class = VecInfoFactory.get_vec_info_class(vec_info_class, env)
     env = VecMonitorWrapper(vec_info_class, env)
 
-    return env
\ No newline at end of file
+    return env
diff --git a/openrl/envs/nlp/daily_dialog_env.py b/openrl/envs/nlp/daily_dialog_env.py
index 2aa08684..d197a232 100644
--- a/openrl/envs/nlp/daily_dialog_env.py
+++ b/openrl/envs/nlp/daily_dialog_env.py
@@ -72,16 +72,18 @@ def __init__(
 
         # set the observation and action space here
        self._vocab_size = self.tokenizer.vocab_size
-        self.observation_space = DictSpace({
-            "input_encoded_pt": spaces.Box(
-                low=0,
-                high=self._vocab_size,
-                shape=(self._max_text_length + self.max_steps,),
-            ),
-            "input_attention_mask_pt": spaces.Box(
-                low=0, high=1, shape=(self._max_text_length + self.max_steps,)
-            ),
-        })
+        self.observation_space = DictSpace(
+            {
+                "input_encoded_pt": spaces.Box(
+                    low=0,
+                    high=self._vocab_size,
+                    shape=(self._max_text_length + self.max_steps,),
+                ),
+                "input_attention_mask_pt": spaces.Box(
+                    low=0, high=1, shape=(self._max_text_length + self.max_steps,)
+                ),
+            }
+        )
         self.action_space = Discrete(n=self._vocab_size)
         # see https://github.com/huggingface/transformers/issues/4875 : rounding up to nearest power of 2 for better GPU efficiency
 
@@ -112,7 +114,6 @@ def __init__(
         self.reward_function = None
 
     def set_reward(self, reward_fn=None):
-
         self.reward_function = reward_fn
 
     def step_word(self, word: str) -> Tuple[Dict[str, torch.tensor], int, bool, dict]:
diff --git a/openrl/envs/nlp/fake_dialog_env.py b/openrl/envs/nlp/fake_dialog_env.py
index 27f9d8f4..02247bc0 100644
--- a/openrl/envs/nlp/fake_dialog_env.py
+++ b/openrl/envs/nlp/fake_dialog_env.py
@@ -30,16 +30,18 @@ def __init__(
 
         # set the observation and action space here
         self._vocab_size = 2
-        self.observation_space = DictSpace({
-            "input_encoded_pt": spaces.Box(
"input_encoded_pt": spaces.Box( - low=0, - high=self._vocab_size, - shape=(self._max_text_length + self.max_steps,), - ), - "input_attention_mask_pt": spaces.Box( - low=0, high=1, shape=(self._max_text_length + self.max_steps,) - ), - }) + self.observation_space = DictSpace( + { + "input_encoded_pt": spaces.Box( + low=0, + high=self._vocab_size, + shape=(self._max_text_length + self.max_steps,), + ), + "input_attention_mask_pt": spaces.Box( + low=0, high=1, shape=(self._max_text_length + self.max_steps,) + ), + } + ) self.action_space = Discrete(n=self._vocab_size) n = 2 diff --git a/openrl/envs/nlp/rewards/intent.py b/openrl/envs/nlp/rewards/intent.py index 2c82e96f..bc4da36c 100644 --- a/openrl/envs/nlp/rewards/intent.py +++ b/openrl/envs/nlp/rewards/intent.py @@ -41,10 +41,9 @@ def __init__( self.use_model_parallel = False if intent_model == "builtin_intent": - self._device = "cpu" - self.use_data_parallel = False - + self.use_data_parallel = False + from transformers import GPT2Config, GPT2LMHeadModel class TestTokenizer: diff --git a/openrl/envs/nlp/rewards/kl_penalty.py b/openrl/envs/nlp/rewards/kl_penalty.py index 3cfafd4b..c98c6bfb 100644 --- a/openrl/envs/nlp/rewards/kl_penalty.py +++ b/openrl/envs/nlp/rewards/kl_penalty.py @@ -47,10 +47,9 @@ def __init__( # reference model if ref_model == "builtin_ref": - self.device = "cpu" - self.use_data_parallel = False - + self.use_data_parallel = False + from transformers import GPT2Config, GPT2LMHeadModel config = GPT2Config() @@ -146,10 +145,12 @@ def __call__( rew = -self._alpha * kl_div infos = [] for kl in kl_div: - infos.append({ - "alpha": self._alpha, - "kl_div": kl.mean(), - }) + infos.append( + { + "alpha": self._alpha, + "kl_div": kl.mean(), + } + ) return rew, infos def _prepare_inputs_for_model( diff --git a/openrl/envs/nlp/utils/metrics/meteor.py b/openrl/envs/nlp/utils/metrics/meteor.py index c2345fa9..ab15e66d 100644 --- a/openrl/envs/nlp/utils/metrics/meteor.py +++ b/openrl/envs/nlp/utils/metrics/meteor.py @@ -88,16 +88,20 @@ def _info(self): citation=_CITATION, inputs_description=_KWARGS_DESCRIPTION, features=[ - datasets.Features({ - "predictions": datasets.Value("string", id="sequence"), - "references": datasets.Sequence( - datasets.Value("string", id="sequence"), id="references" - ), - }), - datasets.Features({ - "predictions": datasets.Value("string", id="sequence"), - "references": datasets.Value("string", id="sequence"), - }), + datasets.Features( + { + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Sequence( + datasets.Value("string", id="sequence"), id="references" + ), + } + ), + datasets.Features( + { + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Value("string", id="sequence"), + } + ), ], codebase_urls=[ "https://github.com/nltk/nltk/blob/develop/nltk/translate/meteor_score.py" diff --git a/openrl/modules/networks/policy_network_gpt.py b/openrl/modules/networks/policy_network_gpt.py index 906f1fb5..193094a7 100644 --- a/openrl/modules/networks/policy_network_gpt.py +++ b/openrl/modules/networks/policy_network_gpt.py @@ -46,7 +46,6 @@ def __init__( disable_drop_out: bool = True, extra_args=None, ) -> None: - self.device = device self.use_fp16 = cfg.use_fp16 self.use_deepspeed = cfg.use_deepspeed diff --git a/openrl/modules/networks/value_network_gpt.py b/openrl/modules/networks/value_network_gpt.py index afffffc2..0c5b1154 100644 --- a/openrl/modules/networks/value_network_gpt.py +++ b/openrl/modules/networks/value_network_gpt.py @@ -44,7 
         device=torch.device("cpu"),
         extra_args=None,
     ):
-
         self.device = device
 
         self.use_fp16 = cfg.use_fp16
diff --git a/openrl/modules/utils/valuenorm.py b/openrl/modules/utils/valuenorm.py
index 0367084a..43aaad9c 100644
--- a/openrl/modules/utils/valuenorm.py
+++ b/openrl/modules/utils/valuenorm.py
@@ -24,9 +24,15 @@ def __init__(
         self.per_element_update = per_element_update
         self.tpdv = dict(dtype=torch.float32, device=device)
 
-        self.running_mean = nn.Parameter(torch.zeros(input_shape), requires_grad=False).to(**self.tpdv)
-        self.running_mean_sq = nn.Parameter(torch.zeros(input_shape), requires_grad=False).to(**self.tpdv)
-        self.debiasing_term = nn.Parameter(torch.tensor(0.0), requires_grad=False).to(**self.tpdv)
+        self.running_mean = nn.Parameter(
+            torch.zeros(input_shape), requires_grad=False
+        ).to(**self.tpdv)
+        self.running_mean_sq = nn.Parameter(
+            torch.zeros(input_shape), requires_grad=False
+        ).to(**self.tpdv)
+        self.debiasing_term = nn.Parameter(torch.tensor(0.0), requires_grad=False).to(
+            **self.tpdv
+        )
 
         # self.running_mean = nn.Parameter(torch.zeros(input_shape), requires_grad=False)
         # self.running_mean_sq = nn.Parameter(