diff --git a/reinforcement-learning/orion-hpc/example.ipynb b/reinforcement-learning/orion-hpc/example.ipynb
index a2b5b33..a0e1654 100644
--- a/reinforcement-learning/orion-hpc/example.ipynb
+++ b/reinforcement-learning/orion-hpc/example.ipynb
@@ -16,7 +16,7 @@
    "cell_type": "code",
    "outputs": [],
    "source": [
-    "WEIGHTS = './weights/weights-[NUMBER]' # NB: without '.pth'"
+    "WEIGHTS = './weights/weights-5000' # NB: without '.pth'"
    ],
    "metadata": {
     "collapsed": false
@@ -46,16 +46,16 @@
    "source": [
     "network = {\n",
     "    \"input_channels\": 1, \"outputs\": 5,\n",
-    "    \"channels\": [32, 64, 64],\n",
-    "    \"kernels\": [5, 3, 3],\n",
-    "    \"strides\": [3, 2, 1],\n",
-    "    \"nodes\": [64]\n",
+    "    \"channels\": [32, 32],\n",
+    "    \"kernels\": [8, 5],\n",
+    "    \"strides\": [4, 2],\n",
     "}\n",
     "optimizer = {\n",
     "    \"optimizer\": torch.optim.AdamW,\n",
     "    \"lr\": 0.001,\n",
     "    \"hyperparameters\": {}\n",
-    "}"
+    "}\n",
+    "reshape = (1, 1, 100, 100)"
    ],
    "metadata": {
     "collapsed": false
@@ -68,7 +68,7 @@
    "outputs": [],
    "source": [
     "value_agent = VisionDeepQ(\n",
-    "    network=network, optimizer=optimizer,\n",
+    "    network=network, optimizer=optimizer, shape=reshape,\n",
     ")\n",
     "\n",
     "weights = torch.load(f'{WEIGHTS}.pth', map_location=torch.device('cpu'))\n",
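For reference, a minimal sketch of how the updated notebook cells fit together, rebuilding the smaller network and restoring the bundled checkpoint on CPU. The `load_state_dict`/`eval` calls and the `from agent import VisionDeepQ` import path are assumptions, not shown in the patch:

```python
import torch

from agent import VisionDeepQ  # assumed import path

WEIGHTS = './weights/weights-5000'  # NB: without '.pth'

network = {
    "input_channels": 1, "outputs": 5,
    "channels": [32, 32],
    "kernels": [8, 5],
    "strides": [4, 2],
}
optimizer = {"optimizer": torch.optim.AdamW, "lr": 0.001, "hyperparameters": {}}
reshape = (1, 1, 100, 100)

# Rebuild the agent with the architecture the checkpoint was trained with,
# then restore the saved parameters on CPU.
value_agent = VisionDeepQ(network=network, optimizer=optimizer, shape=reshape)
weights = torch.load(f'{WEIGHTS}.pth', map_location=torch.device('cpu'))
value_agent.load_state_dict(weights)  # assumes the .pth file holds a plain state_dict
value_agent.eval()
```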
diff --git a/reinforcement-learning/orion-hpc/upload/agent.py b/reinforcement-learning/orion-hpc/upload/agent.py
index 134ba83..f34ea54 100644
--- a/reinforcement-learning/orion-hpc/upload/agent.py
+++ b/reinforcement-learning/orion-hpc/upload/agent.py
@@ -57,6 +57,20 @@ def __init__(self,
         other : dict
             Additional parameters.
 
+            exploration_rate : float, optional
+                Initial exploration rate.
+            exploration_min : float, optional
+                Minimum exploration rate.
+            exploration_steps : int, optional
+                Number of steps before `exploration_min` is reached.
+            punishment : float, optional
+                Punishment for losing a game.
+                E.g., `-10` reward for losing a game.
+            incentive : float, optional
+                Incentive scaling for rewards.
+                Boosts the rewards gained by a factor.
+            memory : int, optional
+                Number of recent games to keep in memory.
             discount : float, optional
                 Discount factor for future rewards.
                 --> 0: only consider immediate rewards
@@ -141,12 +155,17 @@ def __init__(self,
         # control through deep reinforcement learning" (2015).
 
         self.parameter = {
+            "shape": shape,
+
             "rate": other.get("exploration_rate", 0.9),
             "min": other.get("exploration_min", 0.01),
             "decay":
                 (other.get("exploration_rate", 0.9) - other.get("exploration_min", 0.01))
                 / other.get("exploration_steps", 1500),
+            "punishment": other.get("punishment", -1),
+            "incentive": other.get("incentive", 1),
+
             "discount": other.get("discount", 0.99),
             "gamma": other.get("gamma", 0.95),
@@ -180,6 +199,9 @@ def forward(self, state):
         state = state.to(self.device) / torch.tensor(255, dtype=torch.float32,
                                                      device=self.device)
 
+        state = torch.nn.functional.interpolate(state,
+                                                size=self.parameter["shape"][2:])
+
         _output = torch.relu(self.layer_0(state))
         for i in range(1, len(self._modules) - 1):
             if i > self.parameter["convolutions"]:
@@ -258,9 +280,11 @@ def learn(self, network):
 
         _reward = 0
         for i in reversed(range(len(rewards))):
-            _reward = 0 if i in steps else _reward
-            _reward = _reward * self.parameter["discount"] + rewards[i]
+            _reward = self.parameter["punishment"] if i in steps else _reward
+            _reward = (_reward * self.parameter["discount"]
+                       + rewards[i] * self.parameter["incentive"])
             rewards[i] = _reward
+
         rewards = ((rewards - rewards.mean()) / (rewards.std() + 1e-7)).view(-1, 1)
 
         # Q-LEARNING
@@ -305,7 +329,7 @@ def learn(self, network):
         self.parameter["optimizer"].zero_grad()
         loss.backward()
 
-        # # Clamping gradients as per the Google DeepMind paper.
+        # Clamping gradients as per the Google DeepMind paper.
         for param in self.parameters():
             param.grad.data.clamp_(-1, 1)
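The reworked reward pass in `learn()` walks the episode backwards, restarts the running return from `punishment` at terminal indices, scales observed rewards by `incentive`, and finally standardizes. A self-contained sketch of that computation, with illustrative numbers (not taken from the training run):

```python
import torch

def discounted_returns(rewards, steps, discount=0.95, punishment=-5, incentive=1):
    """Reversed accumulation of discounted returns, mirroring the patched learn()."""
    rewards = rewards.clone().float()
    _reward = 0
    for i in reversed(range(len(rewards))):
        # At a terminal index the running return restarts from the punishment value.
        _reward = punishment if i in steps else _reward
        _reward = _reward * discount + rewards[i] * incentive
        rewards[i] = _reward
    # Standardize; the 1e-7 keeps the division stable when returns are nearly constant.
    return ((rewards - rewards.mean()) / (rewards.std() + 1e-7)).view(-1, 1)

# Two games of three steps each, ending at indices 2 and 5.
returns = discounted_returns(torch.tensor([0., 0., 1., 0., 1., 0.]), steps=[2, 5])
```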
diff --git a/reinforcement-learning/orion-hpc/upload/train.py b/reinforcement-learning/orion-hpc/upload/train.py
index 16c9519..2ebbd86 100644
--- a/reinforcement-learning/orion-hpc/upload/train.py
+++ b/reinforcement-learning/orion-hpc/upload/train.py
@@ -6,8 +6,10 @@
 import os
 import re
+import sys
 import copy
 import time
+import signal
 import logging
 
 import torch
@@ -51,28 +53,32 @@
 GAMES = 50000
 SHAPE = (1, 1, 210, 160)
+RESHAPE = (1, 1, 100, 100)
 
-DISCOUNT = 0.98
+DISCOUNT = 0.95
 GAMMA = 0.99
 
-EXPLORATION_RATE = 1.0
-EXPLORATION_MIN = 0.01
-EXPLORATION_STEPS = 20000
+PUNISHMENT = -5
+INCENTIVE = 1
 
-MINIBATCH = 48
+MINIBATCH = 64
 TRAIN_EVERY = 5
 START_TRAINING_AT = 2500
 
+EXPLORATION_RATE = 0.88
+EXPLORATION_MIN = 0.01
+EXPLORATION_STEPS = 3000 // TRAIN_EVERY
+
 REMEMBER_ALL = False
 MEMORY = 250
-RESET_Q_EVERY = 250
+RESET_Q_EVERY = 150
 
 NETWORK = {
     "input_channels": 1, "outputs": 5,
-    "channels": [32, 64, 64],
-    "kernels": [5, 3, 3],
-    "strides": [3, 2, 1],
-    "nodes": [64]
+    "channels": [32, 32],
+    "kernels": [8, 5],
+    "strides": [4, 2],
+    # "nodes": []
 }
 OPTIMIZER = {
     "optimizer": torch.optim.AdamW,
@@ -88,13 +94,15 @@
 value_agent = VisionDeepQ(
     network=NETWORK, optimizer=OPTIMIZER,
-    batch_size=MINIBATCH, shape=SHAPE,
+    batch_size=MINIBATCH, shape=RESHAPE,
 
     other={
         "discount": DISCOUNT, "gamma": GAMMA,
 
         "memory": MEMORY,
+        "incentive": INCENTIVE, "punishment": PUNISHMENT,
+
         "exploration_rate": EXPLORATION_RATE,
         "exploration_steps": EXPLORATION_STEPS,
         "exploration_min": EXPLORATION_MIN
@@ -136,7 +144,7 @@
 # Misc
 # --------------------------------------------------------------------------------------------------
 
-CHECKPOINT = GAMES // 25
+CHECKPOINT = GAMES // 50
 METRICS = {
     "steps": torch.zeros(GAMES),
     "losses": torch.zeros(GAMES // TRAIN_EVERY),
@@ -194,8 +202,8 @@
         if REMEMBER_ALL or REWARDS > 0:
             logger.debug(" Memorizing game")
             value_agent.memorize(state, STEPS)
-            logger.info(" %s > Rewards: %s Steps: %s Memory: %s %%",
-                        game, int(REWARDS), STEPS, len(value_agent.memory["memory"]) * 100 / MEMORY)
+            logger.info(" %s > Rewards: %s Steps: %s Memory: %s",
+                        game, int(REWARDS), STEPS, len(value_agent.memory["memory"]))
         else:
             logger.debug(" Not memorizing game")
             value_agent.memory["game"].clear()
diff --git a/reinforcement-learning/orion-hpc/weights/weights-5000.pth b/reinforcement-learning/orion-hpc/weights/weights-5000.pth
new file mode 100644
index 0000000..5d85b6a
Binary files /dev/null and b/reinforcement-learning/orion-hpc/weights/weights-5000.pth differ
diff --git a/reinforcement-learning/policy-based/agent.py b/reinforcement-learning/policy-based/agent.py
index 6430dec..62b670f 100644
--- a/reinforcement-learning/policy-based/agent.py
+++ b/reinforcement-learning/policy-based/agent.py
@@ -146,7 +146,7 @@ def learn(self):
         for i in reversed(range(len(rewards))):
             _reward = _reward * self.discount + rewards[i]
             rewards[i] = _reward
-        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-9)
+        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-7)
 
         # POLICY GRADIENT
         # ------------------------------------------------------------------------------------------
diff --git a/reinforcement-learning/policy-based/mlx/agent.py b/reinforcement-learning/policy-based/mlx/agent.py
index 8ddd747..56c33fb 100644
--- a/reinforcement-learning/policy-based/mlx/agent.py
+++ b/reinforcement-learning/policy-based/mlx/agent.py
@@ -171,7 +171,7 @@ def learn(self):
         mean = mx.mean(rewards)
         std = mx.sqrt(mx.sum((rewards - mean) ** 2)/rewards.shape[0])
-        rewards = (rewards - mean) / (std + 1e-9)
+        rewards = (rewards - mean) / (std + 1e-7)
 
         # POLICY GRADIENT
         # ------------------------------------------------------------------------------------------
diff --git a/reinforcement-learning/value-based/agent_basic.py b/reinforcement-learning/value-based/agent_basic.py
index ec4d8ee..b65e4a4 100644
--- a/reinforcement-learning/value-based/agent_basic.py
+++ b/reinforcement-learning/value-based/agent_basic.py
@@ -180,7 +180,7 @@ def learn(self, network):
             _reward = 0 if i in steps else _reward
             _reward = _reward * self.discount + rewards[i]
             rewards[i] = _reward
-        rewards = ((rewards - rewards.mean()) / (rewards.std() + 1e-9)).view(-1, 1)
+        rewards = ((rewards - rewards.mean()) / (rewards.std() + 1e-7)).view(-1, 1)
 
         # Q-LEARNING
         # ------------------------------------------------------------------------------------------
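The exploration schedule is now linear: the agents subtract a fixed `decay = (rate - min) / steps` rather than multiplying by a factor. With the new constants in `train.py`, and assuming one decay step per `learn()` call (training runs every `TRAIN_EVERY` games), the rate reaches its floor after 600 training calls:

```python
EXPLORATION_RATE = 0.88
EXPLORATION_MIN = 0.01
TRAIN_EVERY = 5
EXPLORATION_STEPS = 3000 // TRAIN_EVERY   # 600 decay steps

# Fixed decrement per training call, as in the agents' linear decay.
decay = (EXPLORATION_RATE - EXPLORATION_MIN) / EXPLORATION_STEPS  # 0.00145

rate = EXPLORATION_RATE
for _ in range(EXPLORATION_STEPS):
    rate = max(rate - decay, EXPLORATION_MIN)

print(round(rate, 4))  # 0.01 -- the floor is reached after exactly 600 decrements
```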
""" super().__init__() self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -72,17 +88,22 @@ def __init__(self, # Default discount factor is 0.99, as suggested by the Google DeepMind paper "Human-level # control through deep reinforcement learning" (2015). - self.explore = { + self.parameter = { "rate": other.get("exploration_rate", 0.9), - "decay": other.get("exploration_decay", 0.999), "min": other.get("exploration_min", 0.01), + "decay": + (other.get("exploration_rate", 0.9) - other.get("exploration_min", 0.01)) + / other.get("exploration_steps", 1500), + + "punishment": other.get("punishment", -10), + "incentive": other.get("incentive", 100), "discount": other.get("discount", 0.99), "gamma": other.get("gamma", 0.95), - } - self.optimizer = optimizer["optimizer"](self.parameters(), lr=optimizer["lr"], + "optimizer": optimizer["optimizer"](self.parameters(), lr=optimizer["lr"], **optimizer.get("hyperparameters", {})) + } self.batch_size = batch_size self.memory = deque(maxlen=other.get("memory", 2500)) @@ -130,8 +151,7 @@ def action(self, state): actions : torch.Tensor Q-values for each action. """ - - if np.random.rand() < self.explore["rate"]: + if np.random.rand() < self.parameter["rate"]: action = torch.tensor([np.random.choice( range(next(reversed(self._modules.values())).out_features) )], dtype=torch.long) @@ -183,10 +203,12 @@ def learn(self, network): _reward = 0 for i in reversed(range(len(rewards))): - _reward = 0 if i in steps else _reward - _reward = _reward * self.explore["discount"] + rewards[i] + _reward = self.parameter["punishment"] if i in steps else _reward + _reward = (_reward * self.parameter["discount"] + + rewards[i] * self.parameter["incentive"]) rewards[i] = _reward - rewards = ((rewards - rewards.mean()) / (rewards.std() + 1e-9)).view(-1, 1) + + rewards = ((rewards - rewards.mean()) / (rewards.std() + 1e-7)).view(-1, 1) # Q-LEARNING # ------------------------------------------------------------------------------------------ @@ -210,7 +232,7 @@ def learn(self, network): with torch.no_grad(): optimal = (rewards + - self.explore["gamma"] * network(new_states).max(1).values.view(-1, 1)) + self.parameter["gamma"] * network(new_states).max(1).values.view(-1, 1)) # As Google DeepMind suggests, the optimal Q-value is set to r if the game is over. for step in steps: @@ -219,17 +241,22 @@ def learn(self, network): # BACKPROPAGATION # ------------------------------------------------------------------------------------------ - loss = torch.nn.functional.smooth_l1_loss(actual, optimal) + loss = torch.nn.functional.mse_loss(actual, optimal) - self.optimizer.zero_grad() + self.parameter["optimizer"].zero_grad() loss.backward() - self.optimizer.step() + + # Clamping gradients as per the Google DeepMind paper. 
diff --git a/reinforcement-learning/value-based/agent_image.py b/reinforcement-learning/value-based/agent_image.py
index 4e13d6c..780d169 100644
--- a/reinforcement-learning/value-based/agent_image.py
+++ b/reinforcement-learning/value-based/agent_image.py
@@ -56,6 +56,20 @@ def __init__(self,
         other : dict
             Additional parameters.
 
+            exploration_rate : float, optional
+                Initial exploration rate.
+            exploration_min : float, optional
+                Minimum exploration rate.
+            exploration_steps : int, optional
+                Number of steps before `exploration_min` is reached.
+            punishment : float, optional
+                Punishment for losing a game.
+                E.g., `-10` reward for losing a game.
+            incentive : float, optional
+                Incentive scaling for rewards.
+                Boosts the rewards gained by a factor.
+            memory : int, optional
+                Number of recent games to keep in memory.
             discount : float, optional
                 Discount factor for future rewards.
                 --> 0: only consider immediate rewards
@@ -106,8 +120,6 @@ def __init__(self,
             setattr(self, f"layer_{i}", torch.nn.Conv2d(_in, _out,
                                                         kernel_size=_kernel, stride=_stride))
 
-        self.convolutions = len(network["channels"]) - len(network.get("nodes", []))
-
         # Calculating the output shape of convolutional layers:
         # ------------------------------------------------------------------------------------------
@@ -140,20 +152,24 @@ def __init__(self,
         # Default discount factor is 0.99, as suggested by the Google DeepMind paper "Human-level
         # control through deep reinforcement learning" (2015).
 
-        eps_rate = other.get("exploration_rate", 0.9)
-        eps_steps = other.get("exploration_steps", 1500)
-        eps_min = other.get("exploration_min", 0.01)
         self.parameter = {
-            "rate": eps_rate,
-            "decay": (eps_rate - eps_min) / eps_steps,
-            "min": eps_min,
+            "rate": other.get("exploration_rate", 0.9),
+            "min": other.get("exploration_min", 0.01),
+            "decay":
+                (other.get("exploration_rate", 0.9) - other.get("exploration_min", 0.01))
+                / other.get("exploration_steps", 1500),
+
+            "punishment": other.get("punishment", -10),
+            "incentive": other.get("incentive", 100),
 
             "discount": other.get("discount", 0.99),
             "gamma": other.get("gamma", 0.95),
-        }
 
-        self.optimizer = optimizer["optimizer"](self.parameters(), lr=optimizer["lr"],
+            "convolutions": len(network["channels"]) - len(network.get("nodes", [])),
+
+            "optimizer": optimizer["optimizer"](self.parameters(), lr=optimizer["lr"],
                                                 **optimizer.get("hyperparameters", {}))
+        }
 
         self.memory = {
             "batch_size": batch_size,
@@ -257,13 +273,14 @@ def learn(self, network):
         # achieved by reversely adding the observed reward and the discounted cumulative future
         # rewards. The rewards are then standardized.
 
-        _rewards = torch.zeros_like(rewards)
-        for i in reversed(range(len(_rewards))):
-            _reward = _steps[steps.index(i)] if i in steps else _rewards[i + 1]
-            _rewards[i] = _reward * self.parameter["discount"] + rewards[i]
+        _reward = 0
+        for i in reversed(range(len(rewards))):
+            _reward = self.parameter["punishment"] if i in steps else _reward
+            _reward = (_reward * self.parameter["discount"]
+                       + rewards[i] * self.parameter["incentive"])
+            rewards[i] = _reward
 
-        rewards = (((_rewards - _rewards.mean()) / (_rewards.std() + 1e-9))
-                   .view(-1, 1).to(self.device))
+        rewards = ((rewards - rewards.mean()) / (rewards.std() + 1e-7)).view(-1, 1)
 
         # Q-LEARNING
         # ------------------------------------------------------------------------------------------
@@ -301,14 +318,14 @@ def learn(self, network):
 
         loss = torch.nn.functional.mse_loss(actual, optimal)
 
-        self.optimizer.zero_grad()
+        self.parameter["optimizer"].zero_grad()
         loss.backward()
 
         # # Clamping gradients as per the Google DeepMind paper.
-        # for param in self.parameters():
-        #     param.grad.data.clamp_(-1, 1)
+        for param in self.parameters():
+            param.grad.data.clamp_(-1, 1)
 
-        self.optimizer.step()
+        self.parameter["optimizer"].step()
 
         # EXPLORATION RATE DECAY
         # ------------------------------------------------------------------------------------------
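agent_image.py now actually applies the gradient clamping that was previously commented out. The pattern in isolation (the linear layer below is just a stand-in for the Q-network):

```python
import torch

net = torch.nn.Linear(4, 2)  # stand-in for the Q-network
optimizer = torch.optim.AdamW(net.parameters(), lr=0.001)

loss = net(torch.randn(8, 4)).pow(2).mean()

optimizer.zero_grad()
loss.backward()

# Clamp every gradient element to [-1, 1] before stepping, as per the DeepMind paper.
for param in net.parameters():
    param.grad.data.clamp_(-1, 1)

optimizer.step()
```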
diff --git a/reinforcement-learning/value-based/agent_video.py b/reinforcement-learning/value-based/agent_video.py
index b372e56..60a4332 100644
--- a/reinforcement-learning/value-based/agent_video.py
+++ b/reinforcement-learning/value-based/agent_video.py
@@ -319,7 +319,7 @@ def learn(self, network):
             _reward = 0 if i in steps else _reward
             _reward = _reward * self.parameter["discount"] + rewards[i]
             rewards[i] = _reward
-        rewards = ((rewards - rewards.mean()) / (rewards.std() + 1e-9)).view(-1, 1)
+        rewards = ((rewards - rewards.mean()) / (rewards.std() + 1e-7)).view(-1, 1)
 
         # Q-LEARNING
         # ------------------------------------------------------------------------------------------
diff --git a/reinforcement-learning/value-based/example-image-tetris.ipynb b/reinforcement-learning/value-based/example-image-tetris.ipynb
index 755ed88..21e57a4 100644
--- a/reinforcement-learning/value-based/example-image-tetris.ipynb
+++ b/reinforcement-learning/value-based/example-image-tetris.ipynb
@@ -103,6 +103,8 @@
    "| SHAPE | input shape of the network (batch, channels, height, width) |\n",
    "| DISCOUNT | discount rate for rewards |\n",
    "| GAMMA | discount rate for Q-learning |\n",
+   "| PUNISHMENT | punishment for losing |\n",
+   "| INCENTIVE | incentive for rewards |\n",
    "| EXPLORATION_RATE | initial exploration rate |\n",
    "| EXPLORATION_MIN | minimum exploration rate |\n",
    "| EXPLORATION_STEPS | number of games to decay exploration rate from `RATE` to `MIN` |\n",
diff --git a/reinforcement-learning/value-based/mlx/agent_basic.py b/reinforcement-learning/value-based/mlx/agent_basic.py
index 8bb9e46..aaa6cb0 100644
--- a/reinforcement-learning/value-based/mlx/agent_basic.py
+++ b/reinforcement-learning/value-based/mlx/agent_basic.py
@@ -209,7 +209,7 @@ def learn(self, network):
         mean = mx.mean(rewards)
         std = mx.sqrt(mx.sum((rewards - mean) ** 2)/rewards.shape[0])
-        rewards = (rewards - mean) / (std + 1e-9)
+        rewards = (rewards - mean) / (std + 1e-7)
 
         # GRADIENT
         # ------------------------------------------------------------------------------------------
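The remaining PyTorch files only raise the standardization epsilon from 1e-9 to 1e-7. The epsilon matters when a batch of returns is nearly constant, where the standard deviation approaches zero; a quick illustration:

```python
import torch

rewards = torch.tensor([1.0, 1.0, 1.0, 1.0])  # (nearly) constant returns
std = rewards.std()                            # 0.0

# Without the epsilon this would divide by zero; with it the result stays finite.
standardized = (rewards - rewards.mean()) / (std + 1e-7)
```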
diff --git a/reinforcement-learning/value-based/mlx/agent_image.py b/reinforcement-learning/value-based/mlx/agent_image.py
index 435d236..7427a6e 100644
--- a/reinforcement-learning/value-based/mlx/agent_image.py
+++ b/reinforcement-learning/value-based/mlx/agent_image.py
@@ -279,7 +279,7 @@ def learn(self, network):
         mean = mx.mean(rewards)
         std = mx.sqrt(mx.sum((rewards - mean) ** 2) / rewards.shape[0])
-        rewards = (rewards - mean) / (std + 1e-9)
+        rewards = (rewards - mean) / (std + 1e-7)
 
         # GRADIENT
         # ------------------------------------------------------------------------------------------
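The MLX agents compute the standard deviation by hand rather than calling a `.std()` method; the same guard in that style, using only the `mlx.core` ops already present in the file:

```python
import mlx.core as mx

rewards = mx.array([1.0, 2.0, 3.0, 4.0])

mean = mx.mean(rewards)
std = mx.sqrt(mx.sum((rewards - mean) ** 2) / rewards.shape[0])  # population std

# Standardize with the same 1e-7 guard used in the PyTorch agents.
rewards = (rewards - mean) / (std + 1e-7)
```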