Updated wrt. Orion HPC.
hallvardnmbu committed Mar 5, 2024
1 parent e9ee821 commit 8d4c216
Showing 13 changed files with 143 additions and 65 deletions.
14 changes: 7 additions & 7 deletions reinforcement-learning/orion-hpc/example.ipynb
@@ -16,7 +16,7 @@
"cell_type": "code",
"outputs": [],
"source": [
"WEIGHTS = './weights/weights-[NUMBER]' # NB: without '.pth'"
"WEIGHTS = './weights/weights-5000' # NB: without '.pth'"
],
"metadata": {
"collapsed": false
@@ -46,16 +46,16 @@
"source": [
"network = {\n",
" \"input_channels\": 1, \"outputs\": 5,\n",
" \"channels\": [32, 64, 64],\n",
" \"kernels\": [5, 3, 3],\n",
" \"strides\": [3, 2, 1],\n",
" \"nodes\": [64]\n",
" \"channels\": [32, 32],\n",
" \"kernels\": [8, 5],\n",
" \"strides\": [4, 2],\n",
"}\n",
"optimizer = {\n",
" \"optimizer\": torch.optim.AdamW,\n",
" \"lr\": 0.001,\n",
" \"hyperparameters\": {}\n",
"}"
"}\n",
"reshape = (1, 1, 100, 100)"
],
"metadata": {
"collapsed": false
@@ -68,7 +68,7 @@
"outputs": [],
"source": [
"value_agent = VisionDeepQ(\n",
" network=network, optimizer=optimizer,\n",
" network=network, optimizer=optimizer, shape=reshape,\n",
")\n",
"\n",
"weights = torch.load(f'{WEIGHTS}.pth', map_location=torch.device('cpu'))\n",
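Pulled out of the notebook, the updated cells compose roughly as below; a minimal evaluation sketch, assuming VisionDeepQ is importable from upload/agent.py and that the checkpoint is a plain state_dict (neither is shown in this diff).

import torch
from agent import VisionDeepQ  # assumed import path; adjust to the repository layout

WEIGHTS = './weights/weights-5000'  # NB: without '.pth'

network = {
    "input_channels": 1, "outputs": 5,
    "channels": [32, 32],
    "kernels": [8, 5],
    "strides": [4, 2],
}
optimizer = {"optimizer": torch.optim.AdamW, "lr": 0.001, "hyperparameters": {}}
reshape = (1, 1, 100, 100)

value_agent = VisionDeepQ(network=network, optimizer=optimizer, shape=reshape)
weights = torch.load(f'{WEIGHTS}.pth', map_location=torch.device('cpu'))
value_agent.load_state_dict(weights)  # assumed: the checkpoint stores a plain state_dict
value_agent.eval()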
30 changes: 27 additions & 3 deletions reinforcement-learning/orion-hpc/upload/agent.py
@@ -57,6 +57,20 @@ def __init__(self,
other : dict
Additional parameters.
exploration_rate : float, optional
Initial exploration rate.
exploration_min : float, optional
Minimum exploration rate.
exploration_steps : int, optional
Number of steps before `exploration_min` is reached.
punishment : float, optional
Punishment for losing a game.
E.g., `-10` reward for losing a game.
incentive : float, optional
Incentive scaling for rewards.
Boosts the rewards gained by a factor.
memory : int, optional
Number of recent games to keep in memory.
discount : float, optional
Discount factor for future rewards.
--> 0: only consider immediate rewards
@@ -141,12 +155,17 @@ def __init__(self,
# control through deep reinforcement learning" (2015).

self.parameter = {
"shape": shape,

"rate": other.get("exploration_rate", 0.9),
"min": other.get("exploration_min", 0.01),
"decay":
(other.get("exploration_rate", 0.9) - other.get("exploration_min", 0.01))
/ other.get("exploration_steps", 1500),

"punishment": other.get("punishment", -1),
"incentive": other.get("incentive", 1),

"discount": other.get("discount", 0.99),
"gamma": other.get("gamma", 0.95),

@@ -180,6 +199,9 @@ def forward(self, state):
state = state.to(self.device) / torch.tensor(255,
dtype=torch.float32, device=self.device)

state = torch.nn.functional.interpolate(state,
size=self.parameter["shape"][2:])

_output = torch.relu(self.layer_0(state))
for i in range(1, len(self._modules) - 1):
if i > self.parameter["convolutions"]:
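The two added lines in forward() scale the raw frame to [0, 1] and resize it to the spatial part of shape. A standalone sketch of that preprocessing (the 210x160 input size is taken from SHAPE in train.py; interpolate defaults to nearest-neighbour):

import torch

frame = torch.randint(0, 256, (1, 1, 210, 160), dtype=torch.uint8)  # raw Atari-sized frame
state = frame.to(torch.float32) / 255.0                             # scale to [0, 1]
state = torch.nn.functional.interpolate(state, size=(100, 100))     # downscale to RESHAPE[2:]
print(state.shape)                                                   # torch.Size([1, 1, 100, 100])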
@@ -258,9 +280,11 @@ def learn(self, network):

_reward = 0
for i in reversed(range(len(rewards))):
_reward = 0 if i in steps else _reward
_reward = _reward * self.parameter["discount"] + rewards[i]
_reward = self.parameter["punishment"] if i in steps else _reward
_reward = (_reward * self.parameter["discount"]
+ rewards[i] * self.parameter["incentive"])
rewards[i] = _reward

rewards = ((rewards - rewards.mean()) / (rewards.std() + 1e-7)).view(-1, 1)

# Q-LEARNING
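The updated loop seeds each game's return with punishment at its terminal step and scales per-step rewards by incentive before discounting and normalising. A self-contained sketch with illustrative numbers and the defaults from this file:

import torch

rewards = torch.tensor([0.0, 1.0, 0.0, 1.0])    # per-step rewards for one stored game
steps = [3]                                      # indices where a game ended
discount, punishment, incentive = 0.99, -1, 1    # defaults from the hunk above

_reward = 0
for i in reversed(range(len(rewards))):
    _reward = punishment if i in steps else _reward
    _reward = _reward * discount + rewards[i] * incentive
    rewards[i] = _reward

rewards = ((rewards - rewards.mean()) / (rewards.std() + 1e-7)).view(-1, 1)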
@@ -305,7 +329,7 @@ def learn(self, network):
self.parameter["optimizer"].zero_grad()
loss.backward()

# # Clamping gradients as per the Google DeepMind paper.
# Clamping gradients as per the Google DeepMind paper.
for param in self.parameters():
param.grad.data.clamp_(-1, 1)

36 changes: 22 additions & 14 deletions reinforcement-learning/orion-hpc/upload/train.py
@@ -6,8 +6,10 @@

import os
import re
import sys
import copy
import time
import signal
import logging

import torch
@@ -51,28 +53,32 @@

GAMES = 50000
SHAPE = (1, 1, 210, 160)
RESHAPE = (1, 1, 100, 100)

DISCOUNT = 0.98
DISCOUNT = 0.95
GAMMA = 0.99

EXPLORATION_RATE = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_STEPS = 20000
PUNISHMENT = -5
INCENTIVE = 1

MINIBATCH = 48
MINIBATCH = 64
TRAIN_EVERY = 5
START_TRAINING_AT = 2500

EXPLORATION_RATE = 0.88
EXPLORATION_MIN = 0.01
EXPLORATION_STEPS = 3000 // TRAIN_EVERY

REMEMBER_ALL = False
MEMORY = 250
RESET_Q_EVERY = 250
RESET_Q_EVERY = 150

NETWORK = {
"input_channels": 1, "outputs": 5,
"channels": [32, 64, 64],
"kernels": [5, 3, 3],
"strides": [3, 2, 1],
"nodes": [64]
"channels": [32, 32],
"kernels": [8, 5],
"strides": [4, 2],
# "nodes": []
}
OPTIMIZER = {
"optimizer": torch.optim.AdamW,
@@ -88,13 +94,15 @@
value_agent = VisionDeepQ(
network=NETWORK, optimizer=OPTIMIZER,

batch_size=MINIBATCH, shape=SHAPE,
batch_size=MINIBATCH, shape=RESHAPE,

other={
"discount": DISCOUNT, "gamma": GAMMA,

"memory": MEMORY,

"incentive": INCENTIVE, "punishment": PUNISHMENT,

"exploration_rate": EXPLORATION_RATE,
"exploration_steps": EXPLORATION_STEPS,
"exploration_min": EXPLORATION_MIN
@@ -136,7 +144,7 @@
# Misc
# --------------------------------------------------------------------------------------------------

CHECKPOINT = GAMES // 25
CHECKPOINT = GAMES // 50
METRICS = {
"steps": torch.zeros(GAMES),
"losses": torch.zeros(GAMES // TRAIN_EVERY),
@@ -194,8 +202,8 @@
if REMEMBER_ALL or REWARDS > 0:
logger.debug(" Memorizing game")
value_agent.memorize(state, STEPS)
logger.info(" %s > Rewards: %s Steps: %s Memory: %s %%",
game, int(REWARDS), STEPS, len(value_agent.memory["memory"]) * 100 / MEMORY)
logger.info(" %s > Rewards: %s Steps: %s Memory: %s",
game, int(REWARDS), STEPS, len(value_agent.memory["memory"]))
else:
logger.debug(" Not memorizing game")
value_agent.memory["game"].clear()
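As a rough sanity check on the new train.py hyperparameters (not output from an actual run), the checkpoint and exploration numbers work out as below; that CHECKPOINT gates weight saving is assumed from its name, not shown in this diff.

GAMES = 50000
TRAIN_EVERY = 5
CHECKPOINT = GAMES // 50                  # 1000 games between checkpoints (assumed usage)
EXPLORATION_STEPS = 3000 // TRAIN_EVERY   # 600 training updates to reach EXPLORATION_MIN
decay_per_update = (0.88 - 0.01) / EXPLORATION_STEPS
print(CHECKPOINT, EXPLORATION_STEPS, round(decay_per_update, 5))   # 1000 600 0.00145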
Binary file not shown.
2 changes: 1 addition & 1 deletion reinforcement-learning/policy-based/agent.py
@@ -146,7 +146,7 @@ def learn(self):
for i in reversed(range(len(rewards))):
_reward = _reward * self.discount + rewards[i]
rewards[i] = _reward
rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-9)
rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-7)

# POLICY GRADIENT
# ------------------------------------------------------------------------------------------
2 changes: 1 addition & 1 deletion reinforcement-learning/policy-based/mlx/agent.py
@@ -171,7 +171,7 @@ def learn(self):

mean = mx.mean(rewards)
std = mx.sqrt(mx.sum((rewards - mean) ** 2)/rewards.shape[0])
rewards = (rewards - mean) / (std + 1e-9)
rewards = (rewards - mean) / (std + 1e-7)

# POLICY GRADIENT
# ------------------------------------------------------------------------------------------
2 changes: 1 addition & 1 deletion reinforcement-learning/value-based/agent_basic.py
@@ -180,7 +180,7 @@ def learn(self, network):
_reward = 0 if i in steps else _reward
_reward = _reward * self.discount + rewards[i]
rewards[i] = _reward
rewards = ((rewards - rewards.mean()) / (rewards.std() + 1e-9)).view(-1, 1)
rewards = ((rewards - rewards.mean()) / (rewards.std() + 1e-7)).view(-1, 1)

# Q-LEARNING
# ------------------------------------------------------------------------------------------
57 changes: 42 additions & 15 deletions reinforcement-learning/value-based/agent_elaborate.py
@@ -49,10 +49,26 @@ def __init__(self,
other : dict
Additional parameters.
exploration_rate : float, optional
Initial exploration rate.
exploration_min : float, optional
Minimum exploration rate.
exploration_steps : int, optional
Number of steps before `exploration_min` is reached.
punishment : float, optional
Punishment for losing a game.
E.g., `-10` reward for losing a game.
incentive : float, optional
Incentive scaling for rewards.
Boosts the rewards gained by a factor.
memory : int, optional
Number of recent games to keep in memory.
discount : float, optional
Discount factor for future rewards.
--> 0: only consider immediate rewards
--> 1: consider all future rewards equally
gamma : float, optional
Discount factor for Q-learning.
"""
super().__init__()
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -72,17 +88,22 @@ def __init__(self,
# Default discount factor is 0.99, as suggested by the Google DeepMind paper "Human-level
# control through deep reinforcement learning" (2015).

self.explore = {
self.parameter = {
"rate": other.get("exploration_rate", 0.9),
"decay": other.get("exploration_decay", 0.999),
"min": other.get("exploration_min", 0.01),
"decay":
(other.get("exploration_rate", 0.9) - other.get("exploration_min", 0.01))
/ other.get("exploration_steps", 1500),

"punishment": other.get("punishment", -10),
"incentive": other.get("incentive", 100),

"discount": other.get("discount", 0.99),
"gamma": other.get("gamma", 0.95),
}

self.optimizer = optimizer["optimizer"](self.parameters(), lr=optimizer["lr"],
"optimizer": optimizer["optimizer"](self.parameters(), lr=optimizer["lr"],
**optimizer.get("hyperparameters", {}))
}

self.batch_size = batch_size
self.memory = deque(maxlen=other.get("memory", 2500))
@@ -130,8 +151,7 @@ def action(self, state):
actions : torch.Tensor
Q-values for each action.
"""

if np.random.rand() < self.explore["rate"]:
if np.random.rand() < self.parameter["rate"]:
action = torch.tensor([np.random.choice(
range(next(reversed(self._modules.values())).out_features)
)], dtype=torch.long)
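The action() hunk keeps epsilon-greedy selection but reads the rate from self.parameter instead of self.explore. A minimal sketch of the rule outside the class, assuming the greedy branch simply takes the argmax of the network's Q-values:

import numpy as np
import torch

def epsilon_greedy(q_values: torch.Tensor, rate: float) -> torch.Tensor:
    """Random action with probability `rate`, otherwise the greedy (argmax) action."""
    if np.random.rand() < rate:
        return torch.tensor([np.random.choice(range(q_values.shape[-1]))], dtype=torch.long)
    return q_values.argmax(dim=-1)

q_values = torch.tensor([[0.1, 0.4, 0.2, 0.0, 0.3]])   # Q-values for the 5 actions
action = epsilon_greedy(q_values, rate=0.05)            # tensor([1]) unless exploring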
@@ -183,10 +203,12 @@ def learn(self, network):

_reward = 0
for i in reversed(range(len(rewards))):
_reward = 0 if i in steps else _reward
_reward = _reward * self.explore["discount"] + rewards[i]
_reward = self.parameter["punishment"] if i in steps else _reward
_reward = (_reward * self.parameter["discount"]
+ rewards[i] * self.parameter["incentive"])
rewards[i] = _reward
rewards = ((rewards - rewards.mean()) / (rewards.std() + 1e-9)).view(-1, 1)

rewards = ((rewards - rewards.mean()) / (rewards.std() + 1e-7)).view(-1, 1)

# Q-LEARNING
# ------------------------------------------------------------------------------------------
@@ -210,7 +232,7 @@

with torch.no_grad():
optimal = (rewards +
self.explore["gamma"] * network(new_states).max(1).values.view(-1, 1))
self.parameter["gamma"] * network(new_states).max(1).values.view(-1, 1))

# As Google DeepMind suggests, the optimal Q-value is set to r if the game is over.
for step in steps:
@@ -219,17 +241,22 @@
# BACKPROPAGATION
# ------------------------------------------------------------------------------------------

loss = torch.nn.functional.smooth_l1_loss(actual, optimal)
loss = torch.nn.functional.mse_loss(actual, optimal)

self.optimizer.zero_grad()
self.parameter["optimizer"].zero_grad()
loss.backward()
self.optimizer.step()

# Clamping gradients as per the Google DeepMind paper.
for param in self.parameters():
param.grad.data.clamp_(-1, 1)

self.parameter["optimizer"].step()

# EXPLORATION RATE DECAY
# ------------------------------------------------------------------------------------------

self.explore["rate"] = max(self.explore["decay"] * self.explore["rate"],
self.explore["min"])
self.parameter["rate"] = max(self.parameter["rate"] - self.parameter["decay"],
self.parameter["min"])

del states, actions, new_states, rewards, _reward, actual, optimal

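Condensed, the updated backpropagation and exploration steps amount to the sketch below, run on a toy module (names and shapes are illustrative): MSE loss, gradients clamped to [-1, 1] before the optimizer step, then a linear rather than multiplicative epsilon decay.

import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
parameter = {"rate": 0.88, "decay": 0.00145, "min": 0.01}

actual = model(torch.randn(8, 4))     # stand-in for Q(s, a) of the taken actions
optimal = torch.randn(8, 2)           # stand-in for the TD targets

loss = torch.nn.functional.mse_loss(actual, optimal)
optimizer.zero_grad()
loss.backward()
for param in model.parameters():      # clamp gradients as per the Google DeepMind paper
    param.grad.data.clamp_(-1, 1)
optimizer.step()

parameter["rate"] = max(parameter["rate"] - parameter["decay"], parameter["min"])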