Updated wrt. Orion HPC.
hallvardnmbu committed Mar 5, 2024
1 parent e9ee821 commit 8d4c216
Showing 13 changed files with 143 additions and 65 deletions.
14 changes: 7 additions & 7 deletions reinforcement-learning/orion-hpc/example.ipynb
@@ -16,7 +16,7 @@
"cell_type": "code",
"outputs": [],
"source": [
"WEIGHTS = './weights/weights-[NUMBER]' # NB: without '.pth'"
"WEIGHTS = './weights/weights-5000' # NB: without '.pth'"
],
"metadata": {
"collapsed": false
@@ -46,16 +46,16 @@
"source": [
"network = {\n",
" \"input_channels\": 1, \"outputs\": 5,\n",
" \"channels\": [32, 64, 64],\n",
" \"kernels\": [5, 3, 3],\n",
" \"strides\": [3, 2, 1],\n",
" \"nodes\": [64]\n",
" \"channels\": [32, 32],\n",
" \"kernels\": [8, 5],\n",
" \"strides\": [4, 2],\n",
"}\n",
"optimizer = {\n",
" \"optimizer\": torch.optim.AdamW,\n",
" \"lr\": 0.001,\n",
" \"hyperparameters\": {}\n",
"}"
"}\n",
"reshape = (1, 1, 100, 100)"
],
"metadata": {
"collapsed": false
@@ -68,7 +68,7 @@
"outputs": [],
"source": [
"value_agent = VisionDeepQ(\n",
" network=network, optimizer=optimizer,\n",
" network=network, optimizer=optimizer, shape=reshape,\n",
")\n",
"\n",
"weights = torch.load(f'{WEIGHTS}.pth', map_location=torch.device('cpu'))\n",
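Pulled out of the notebook, the updated cells compose roughly as below; a minimal evaluation sketch, assuming VisionDeepQ is importable from upload/agent.py and that the checkpoint is a plain state_dict (neither is shown in this diff).

import torch
from agent import VisionDeepQ  # assumed import path; adjust to the repository layout

WEIGHTS = './weights/weights-5000'  # NB: without '.pth'

network = {
    "input_channels": 1, "outputs": 5,
    "channels": [32, 32],
    "kernels": [8, 5],
    "strides": [4, 2],
}
optimizer = {"optimizer": torch.optim.AdamW, "lr": 0.001, "hyperparameters": {}}
reshape = (1, 1, 100, 100)

value_agent = VisionDeepQ(network=network, optimizer=optimizer, shape=reshape)
weights = torch.load(f'{WEIGHTS}.pth', map_location=torch.device('cpu'))
value_agent.load_state_dict(weights)  # assumed: the checkpoint stores a plain state_dict
value_agent.eval()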
30 changes: 27 additions & 3 deletions reinforcement-learning/orion-hpc/upload/agent.py
@@ -57,6 +57,20 @@ def __init__(self,
other : dict
Additional parameters.
exploration_rate : float, optional
Initial exploration rate.
exploration_min : float, optional
Minimum exploration rate.
exploration_steps : int, optional
Number of steps before `exploration_min` is reached.
punishment : float, optional
Punishment for losing a game.
E.g., `-10` reward for losing a game.
incentive : float, optional
Incentive scaling for rewards.
Boosts the rewards gained by a factor.
memory : int, optional
Number of recent games to keep in memory.
discount : float, optional
Discount factor for future rewards.
--> 0: only consider immediate rewards
@@ -141,12 +155,17 @@ def __init__(self,
# control through deep reinforcement learning" (2015).

self.parameter = {
"shape": shape,

"rate": other.get("exploration_rate", 0.9),
"min": other.get("exploration_min", 0.01),
"decay":
(other.get("exploration_rate", 0.9) - other.get("exploration_min", 0.01))
/ other.get("exploration_steps", 1500),

"punishment": other.get("punishment", -1),
"incentive": other.get("incentive", 1),

"discount": other.get("discount", 0.99),
"gamma": other.get("gamma", 0.95),

@@ -180,6 +199,9 @@ def forward(self, state):
state = state.to(self.device) / torch.tensor(255,
dtype=torch.float32, device=self.device)

state = torch.nn.functional.interpolate(state,
size=self.parameter["shape"][2:])

_output = torch.relu(self.layer_0(state))
for i in range(1, len(self._modules) - 1):
if i > self.parameter["convolutions"]:
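The two added lines in forward() scale the raw frame to [0, 1] and resize it to the spatial part of shape. A standalone sketch of that preprocessing (the 210x160 input size is taken from SHAPE in train.py; interpolate defaults to nearest-neighbour):

import torch

frame = torch.randint(0, 256, (1, 1, 210, 160), dtype=torch.uint8)  # raw Atari-sized frame
state = frame.to(torch.float32) / 255.0                             # scale to [0, 1]
state = torch.nn.functional.interpolate(state, size=(100, 100))     # downscale to RESHAPE[2:]
print(state.shape)                                                   # torch.Size([1, 1, 100, 100])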
@@ -258,9 +280,11 @@ def learn(self, network):

_reward = 0
for i in reversed(range(len(rewards))):
_reward = 0 if i in steps else _reward
_reward = _reward * self.parameter["discount"] + rewards[i]
_reward = self.parameter["punishment"] if i in steps else _reward
_reward = (_reward * self.parameter["discount"]
+ rewards[i] * self.parameter["incentive"])
rewards[i] = _reward

rewards = ((rewards - rewards.mean()) / (rewards.std() + 1e-7)).view(-1, 1)

# Q-LEARNING
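The updated loop seeds each game's return with punishment at its terminal step and scales per-step rewards by incentive before discounting and normalising. A self-contained sketch with illustrative numbers and the defaults from this file:

import torch

rewards = torch.tensor([0.0, 1.0, 0.0, 1.0])    # per-step rewards for one stored game
steps = [3]                                      # indices where a game ended
discount, punishment, incentive = 0.99, -1, 1    # defaults from the hunk above

_reward = 0
for i in reversed(range(len(rewards))):
    _reward = punishment if i in steps else _reward
    _reward = _reward * discount + rewards[i] * incentive
    rewards[i] = _reward

rewards = ((rewards - rewards.mean()) / (rewards.std() + 1e-7)).view(-1, 1)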
@@ -305,7 +329,7 @@ def learn(self, network):
self.parameter["optimizer"].zero_grad()
loss.backward()

# # Clamping gradients as per the Google DeepMind paper.
# Clamping gradients as per the Google DeepMind paper.
for param in self.parameters():
param.grad.data.clamp_(-1, 1)

36 changes: 22 additions & 14 deletions reinforcement-learning/orion-hpc/upload/train.py
@@ -6,8 +6,10 @@

import os
import re
import sys
import copy
import time
import signal
import logging

import torch
@@ -51,28 +53,32 @@

GAMES = 50000
SHAPE = (1, 1, 210, 160)
RESHAPE = (1, 1, 100, 100)

DISCOUNT = 0.98
DISCOUNT = 0.95
GAMMA = 0.99

EXPLORATION_RATE = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_STEPS = 20000
PUNISHMENT = -5
INCENTIVE = 1

MINIBATCH = 48
MINIBATCH = 64
TRAIN_EVERY = 5
START_TRAINING_AT = 2500

EXPLORATION_RATE = 0.88
EXPLORATION_MIN = 0.01
EXPLORATION_STEPS = 3000 // TRAIN_EVERY

REMEMBER_ALL = False
MEMORY = 250
RESET_Q_EVERY = 250
RESET_Q_EVERY = 150

NETWORK = {
"input_channels": 1, "outputs": 5,
"channels": [32, 64, 64],
"kernels": [5, 3, 3],
"strides": [3, 2, 1],
"nodes": [64]
"channels": [32, 32],
"kernels": [8, 5],
"strides": [4, 2],
# "nodes": []
}
OPTIMIZER = {
"optimizer": torch.optim.AdamW,
@@ -88,13 +94,15 @@
value_agent = VisionDeepQ(
network=NETWORK, optimizer=OPTIMIZER,

batch_size=MINIBATCH, shape=SHAPE,
batch_size=MINIBATCH, shape=RESHAPE,

other={
"discount": DISCOUNT, "gamma": GAMMA,

"memory": MEMORY,

"incentive": INCENTIVE, "punishment": PUNISHMENT,

"exploration_rate": EXPLORATION_RATE,
"exploration_steps": EXPLORATION_STEPS,
"exploration_min": EXPLORATION_MIN
@@ -136,7 +144,7 @@
# Misc
# --------------------------------------------------------------------------------------------------

CHECKPOINT = GAMES // 25
CHECKPOINT = GAMES // 50
METRICS = {
"steps": torch.zeros(GAMES),
"losses": torch.zeros(GAMES // TRAIN_EVERY),
@@ -194,8 +202,8 @@
if REMEMBER_ALL or REWARDS > 0:
logger.debug(" Memorizing game")
value_agent.memorize(state, STEPS)
logger.info(" %s > Rewards: %s Steps: %s Memory: %s %%",
game, int(REWARDS), STEPS, len(value_agent.memory["memory"]) * 100 / MEMORY)
logger.info(" %s > Rewards: %s Steps: %s Memory: %s",
game, int(REWARDS), STEPS, len(value_agent.memory["memory"]))
else:
logger.debug(" Not memorizing game")
value_agent.memory["game"].clear()
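As a rough sanity check on the new train.py hyperparameters (not output from an actual run), the checkpoint and exploration numbers work out as below; that CHECKPOINT gates weight saving is assumed from its name, not shown in this diff.

GAMES = 50000
TRAIN_EVERY = 5
CHECKPOINT = GAMES // 50                  # 1000 games between checkpoints (assumed usage)
EXPLORATION_STEPS = 3000 // TRAIN_EVERY   # 600 training updates to reach EXPLORATION_MIN
decay_per_update = (0.88 - 0.01) / EXPLORATION_STEPS
print(CHECKPOINT, EXPLORATION_STEPS, round(decay_per_update, 5))   # 1000 600 0.00145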
Binary file not shown.
2 changes: 1 addition & 1 deletion reinforcement-learning/policy-based/agent.py
@@ -146,7 +146,7 @@ def learn(self):
for i in reversed(range(len(rewards))):
_reward = _reward * self.discount + rewards[i]
rewards[i] = _reward
rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-9)
rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-7)

# POLICY GRADIENT
# ------------------------------------------------------------------------------------------
2 changes: 1 addition & 1 deletion reinforcement-learning/policy-based/mlx/agent.py
@@ -171,7 +171,7 @@ def learn(self):

mean = mx.mean(rewards)
std = mx.sqrt(mx.sum((rewards - mean) ** 2)/rewards.shape[0])
rewards = (rewards - mean) / (std + 1e-9)
rewards = (rewards - mean) / (std + 1e-7)

# POLICY GRADIENT
# ------------------------------------------------------------------------------------------
2 changes: 1 addition & 1 deletion reinforcement-learning/value-based/agent_basic.py
@@ -180,7 +180,7 @@ def learn(self, network):
_reward = 0 if i in steps else _reward
_reward = _reward * self.discount + rewards[i]
rewards[i] = _reward
rewards = ((rewards - rewards.mean()) / (rewards.std() + 1e-9)).view(-1, 1)
rewards = ((rewards - rewards.mean()) / (rewards.std() + 1e-7)).view(-1, 1)

# Q-LEARNING
# ------------------------------------------------------------------------------------------
57 changes: 42 additions & 15 deletions reinforcement-learning/value-based/agent_elaborate.py
@@ -49,10 +49,26 @@ def __init__(self,
other : dict
Additional parameters.
exploration_rate : float, optional
Initial exploration rate.
exploration_min : float, optional
Minimum exploration rate.
exploration_steps : int, optional
Number of steps before `exploration_min` is reached.
punishment : float, optional
Punishment for losing a game.
E.g., `-10` reward for losing a game.
incentive : float, optional
Incentive scaling for rewards.
Boosts the rewards gained by a factor.
memory : int, optional
Number of recent games to keep in memory.
discount : float, optional
Discount factor for future rewards.
--> 0: only consider immediate rewards
--> 1: consider all future rewards equally
gamma : float, optional
Discount factor for Q-learning.
"""
super().__init__()
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -72,17 +88,22 @@ def __init__(self,
# Default discount factor is 0.99, as suggested by the Google DeepMind paper "Human-level
# control through deep reinforcement learning" (2015).

self.explore = {
self.parameter = {
"rate": other.get("exploration_rate", 0.9),
"decay": other.get("exploration_decay", 0.999),
"min": other.get("exploration_min", 0.01),
"decay":
(other.get("exploration_rate", 0.9) - other.get("exploration_min", 0.01))
/ other.get("exploration_steps", 1500),

"punishment": other.get("punishment", -10),
"incentive": other.get("incentive", 100),

"discount": other.get("discount", 0.99),
"gamma": other.get("gamma", 0.95),
}

self.optimizer = optimizer["optimizer"](self.parameters(), lr=optimizer["lr"],
"optimizer": optimizer["optimizer"](self.parameters(), lr=optimizer["lr"],
**optimizer.get("hyperparameters", {}))
}

self.batch_size = batch_size
self.memory = deque(maxlen=other.get("memory", 2500))
@@ -130,8 +151,7 @@ def action(self, state):
actions : torch.Tensor
Q-values for each action.
"""

if np.random.rand() < self.explore["rate"]:
if np.random.rand() < self.parameter["rate"]:
action = torch.tensor([np.random.choice(
range(next(reversed(self._modules.values())).out_features)
)], dtype=torch.long)
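The action() hunk keeps epsilon-greedy selection but reads the rate from self.parameter instead of self.explore. A minimal sketch of the rule outside the class, assuming the greedy branch simply takes the argmax of the network's Q-values:

import numpy as np
import torch

def epsilon_greedy(q_values: torch.Tensor, rate: float) -> torch.Tensor:
    """Random action with probability `rate`, otherwise the greedy (argmax) action."""
    if np.random.rand() < rate:
        return torch.tensor([np.random.choice(range(q_values.shape[-1]))], dtype=torch.long)
    return q_values.argmax(dim=-1)

q_values = torch.tensor([[0.1, 0.4, 0.2, 0.0, 0.3]])   # Q-values for the 5 actions
action = epsilon_greedy(q_values, rate=0.05)            # tensor([1]) unless exploring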
@@ -183,10 +203,12 @@ def learn(self, network):

_reward = 0
for i in reversed(range(len(rewards))):
_reward = 0 if i in steps else _reward
_reward = _reward * self.explore["discount"] + rewards[i]
_reward = self.parameter["punishment"] if i in steps else _reward
_reward = (_reward * self.parameter["discount"]
+ rewards[i] * self.parameter["incentive"])
rewards[i] = _reward
rewards = ((rewards - rewards.mean()) / (rewards.std() + 1e-9)).view(-1, 1)

rewards = ((rewards - rewards.mean()) / (rewards.std() + 1e-7)).view(-1, 1)

# Q-LEARNING
# ------------------------------------------------------------------------------------------
@@ -210,7 +232,7 @@

with torch.no_grad():
optimal = (rewards +
self.explore["gamma"] * network(new_states).max(1).values.view(-1, 1))
self.parameter["gamma"] * network(new_states).max(1).values.view(-1, 1))

# As Google DeepMind suggests, the optimal Q-value is set to r if the game is over.
for step in steps:
@@ -219,17 +241,22 @@
# BACKPROPAGATION
# ------------------------------------------------------------------------------------------

loss = torch.nn.functional.smooth_l1_loss(actual, optimal)
loss = torch.nn.functional.mse_loss(actual, optimal)

self.optimizer.zero_grad()
self.parameter["optimizer"].zero_grad()
loss.backward()
self.optimizer.step()

# Clamping gradients as per the Google DeepMind paper.
for param in self.parameters():
param.grad.data.clamp_(-1, 1)

self.parameter["optimizer"].step()

# EXPLORATION RATE DECAY
# ------------------------------------------------------------------------------------------

self.explore["rate"] = max(self.explore["decay"] * self.explore["rate"],
self.explore["min"])
self.parameter["rate"] = max(self.parameter["rate"] - self.parameter["decay"],
self.parameter["min"])

del states, actions, new_states, rewards, _reward, actual, optimal

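Condensed, the updated backpropagation and exploration steps amount to the sketch below, run on a toy module (names and shapes are illustrative): MSE loss, gradients clamped to [-1, 1] before the optimizer step, then a linear rather than multiplicative epsilon decay.

import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
parameter = {"rate": 0.88, "decay": 0.00145, "min": 0.01}

actual = model(torch.randn(8, 4))     # stand-in for Q(s, a) of the taken actions
optimal = torch.randn(8, 2)           # stand-in for the TD targets

loss = torch.nn.functional.mse_loss(actual, optimal)
optimizer.zero_grad()
loss.backward()
for param in model.parameters():      # clamp gradients as per the Google DeepMind paper
    param.grad.data.clamp_(-1, 1)
optimizer.step()

parameter["rate"] = max(parameter["rate"] - parameter["decay"], parameter["min"])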