
Commit

TimotheeMathieu committed Nov 6, 2023
2 parents d617a86 + e36f8f0 commit 3f32937
Showing 13 changed files with 1,141 additions and 0 deletions.
4 changes: 4 additions & 0 deletions rlberry_scool/agents/__init__.py
@@ -0,0 +1,4 @@
# Interfaces
from .dynprog import ValueIterationAgent
from .mbqvi import MBQVIAgent
from .ucbvi import UCBVIAgent
1 change: 1 addition & 0 deletions rlberry_scool/agents/dynprog/__init__.py
@@ -0,0 +1 @@
from .value_iteration import ValueIterationAgent
272 changes: 272 additions & 0 deletions rlberry_scool/agents/dynprog/utils.py
@@ -0,0 +1,272 @@
import numpy as np
from rlberry.utils.jit_setup import numba_jit


@numba_jit
def backward_induction(R, P, horizon, gamma=1.0, vmax=np.inf):
    """Backward induction to compute Q and V functions in the finite horizon
    setting.

    Parameters
    ----------
    R : numpy.ndarray
        array of shape (S, A) containing the rewards, where S is the number
        of states and A is the number of actions
    P : numpy.ndarray
        array of shape (S, A, S) such that P[s,a,ns] is the probability of
        arriving at ns by taking action a in state s.
    horizon : int
        problem horizon
    gamma : double, default: 1.0
        discount factor
    vmax : double, default: np.inf
        maximum possible value in V

    Returns
    -------
    tuple (Q, V) containing the Q and V functions, of shapes (horizon, S, A)
    and (horizon, S), respectively.
    """
    S, A = R.shape
    V = np.zeros((horizon, S))
    Q = np.zeros((horizon, S, A))
    for hh in range(horizon - 1, -1, -1):
        for ss in range(S):
            max_q = -np.inf
            for aa in range(A):
                q_aa = R[ss, aa]
                if hh < horizon - 1:
                    # use an explicit loop instead of .dot to avoid a scipy
                    # dependency (numba seems to require scipy for linear
                    # algebra operations in numpy)
                    for ns in range(S):
                        q_aa += gamma * P[ss, aa, ns] * V[hh + 1, ns]
                if q_aa > max_q:
                    max_q = q_aa
                Q[hh, ss, aa] = q_aa
            V[hh, ss] = max_q
            if V[hh, ss] > vmax:
                V[hh, ss] = vmax
    return Q, V
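
A minimal usage sketch of backward_induction on a toy 3-state, 2-action MDP (the arrays below are purely illustrative):

import numpy as np
from rlberry_scool.agents.dynprog.utils import backward_induction

S, A, H = 3, 2, 5
R = 0.1 * np.ones((S, A))     # small reward everywhere...
R[2, :] = 1.0                 # ...except in state 2, which is rewarding
P = np.ones((S, A, S)) / S    # uniform transitions: each P[s, a, :] sums to 1
Q, V = backward_induction(R, P, H)    # gamma defaults to 1.0
print(Q.shape, V.shape)       # (5, 3, 2) (5, 3)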


@numba_jit
def backward_induction_reward_sd(Q, V, R, P, gamma=1.0, vmax=np.inf):
    """
    Backward induction to compute Q and V functions in
    the finite horizon setting.

    Assumes R is stage-dependent, but P is stage-independent.
    Takes as input preallocated arrays Q and V in which the results are stored.
    The horizon is inferred from the first dimension of R.

    Parameters
    ----------
    Q : numpy.ndarray
        array of shape (horizon, S, A) in which to store the Q function
    V : numpy.ndarray
        array of shape (horizon, S) in which to store the V function
    R : numpy.ndarray
        array of shape (horizon, S, A) containing the rewards, where S is the
        number of states and A is the number of actions
    P : numpy.ndarray
        array of shape (S, A, S) such that P[s,a,ns] is the probability of
        arriving at ns by taking action a in state s.
    gamma : double, default: 1.0
        discount factor
    vmax : double, default: np.inf
        maximum possible value in V
    """
    H, S, A = R.shape
    horizon = H
    for hh in range(horizon - 1, -1, -1):
        for ss in range(S):
            max_q = -np.inf
            for aa in range(A):
                q_aa = R[hh, ss, aa]
                if hh < horizon - 1:
                    # use an explicit loop instead of .dot to avoid a scipy
                    # dependency (numba seems to require scipy for linear
                    # algebra operations in numpy)
                    for ns in range(S):
                        q_aa += gamma * P[ss, aa, ns] * V[hh + 1, ns]
                if q_aa > max_q:
                    max_q = q_aa
                Q[hh, ss, aa] = q_aa
            V[hh, ss] = max_q
            if V[hh, ss] > vmax:
                V[hh, ss] = vmax


@numba_jit
def backward_induction_in_place(Q, V, R, P, horizon, gamma=1.0, vmax=np.inf):
    """
    Backward induction to compute Q and V functions in
    the finite horizon setting.

    Takes as input preallocated arrays Q and V in which the results are stored.

    Parameters
    ----------
    Q : numpy.ndarray
        array of shape (horizon, S, A) in which to store the Q function
    V : numpy.ndarray
        array of shape (horizon, S) in which to store the V function
    R : numpy.ndarray
        array of shape (S, A) containing the rewards, where S is the number
        of states and A is the number of actions
    P : numpy.ndarray
        array of shape (S, A, S) such that P[s,a,ns] is the probability of
        arriving at ns by taking action a in state s.
    horizon : int
        problem horizon
    gamma : double, default: 1.0
        discount factor
    vmax : double, default: np.inf
        maximum possible value in V
    """
    S, A = R.shape
    for hh in range(horizon - 1, -1, -1):
        for ss in range(S):
            max_q = -np.inf
            for aa in range(A):
                q_aa = R[ss, aa]
                if hh < horizon - 1:
                    # use an explicit loop instead of .dot to avoid a scipy
                    # dependency (numba seems to require scipy for linear
                    # algebra operations in numpy)
                    for ns in range(S):
                        q_aa += gamma * P[ss, aa, ns] * V[hh + 1, ns]
                if q_aa > max_q:
                    max_q = q_aa
                Q[hh, ss, aa] = q_aa
            V[hh, ss] = max_q
            if V[hh, ss] > vmax:
                V[hh, ss] = vmax


@numba_jit
def backward_induction_sd(Q, V, R, P, gamma=1.0, vmax=np.inf):
    """
    In-place implementation of backward induction to compute Q and V functions
    in the finite horizon setting.

    Assumes R and P are stage-dependent.

    Parameters
    ----------
    Q : numpy.ndarray
        array of shape (H, S, A) in which to store the Q function
    V : numpy.ndarray
        array of shape (H, S) in which to store the V function
    R : numpy.ndarray
        array of shape (H, S, A) containing the rewards, where S is the number
        of states and A is the number of actions
    P : numpy.ndarray
        array of shape (H, S, A, S) such that P[h, s, a, ns] is the probability
        of arriving at ns by taking action a in state s at stage h.
    gamma : double, default: 1.0
        discount factor
    vmax : double, default: np.inf
        maximum possible value in V
    """
    H, S, A = R.shape
    for hh in range(H - 1, -1, -1):
        for ss in range(S):
            max_q = -np.inf
            for aa in range(A):
                q_aa = R[hh, ss, aa]
                if hh < H - 1:
                    # use an explicit loop instead of .dot to avoid a scipy
                    # dependency (numba seems to require scipy for linear
                    # algebra operations in numpy)
                    for ns in range(S):
                        q_aa += gamma * P[hh, ss, aa, ns] * V[hh + 1, ns]
                if q_aa > max_q:
                    max_q = q_aa
                Q[hh, ss, aa] = q_aa
            V[hh, ss] = max_q
            # clip V
            if V[hh, ss] > vmax:
                V[hh, ss] = vmax
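
The in-place, fully stage-dependent variant is meant to be called with preallocated Q and V, as in this sketch (toy arrays, illustrative only):

import numpy as np
from rlberry_scool.agents.dynprog.utils import backward_induction_sd

H, S, A = 4, 3, 2
R = np.random.uniform(0.0, 1.0, (H, S, A))    # stage-dependent rewards
P = np.ones((H, S, A, S)) / S                 # stage-dependent uniform transitions
Q = np.zeros((H, S, A))
V = np.zeros((H, S))
backward_induction_sd(Q, V, R, P)             # fills Q and V in place
assert np.allclose(V[0], Q[0].max(axis=-1))   # V is the max of Q over actions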


@numba_jit
def value_iteration(R, P, gamma, epsilon=1e-6):
    """
    Value iteration for discounted problems.

    Parameters
    ----------
    R : numpy.ndarray
        array of shape (S, A) containing the rewards, where S is the number
        of states and A is the number of actions
    P : numpy.ndarray
        array of shape (S, A, S) such that P[s,a,ns] is the probability of
        arriving at ns by taking action a in state s.
    gamma : double
        discount factor
    epsilon : double
        precision

    Returns
    -------
    tuple (Q, V, n_it) containing the epsilon-optimal Q and V functions,
    of shapes (S, A) and (S,), respectively, and n_it, the number of iterations.
    """
    S, A = R.shape
    Q = np.zeros((S, A))
    Q_aux = np.full((S, A), np.inf)
    n_it = 0
    while np.abs(Q - Q_aux).max() > epsilon:
        Q_aux = Q
        Q = bellman_operator(Q, R, P, gamma)
        n_it += 1
    V = np.zeros(S)
    # numba does not support np.max(Q, axis=1)
    for ss in range(S):
        V[ss] = Q[ss, :].max()
    return Q, V, n_it
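
A minimal sketch of value_iteration on a randomly generated discounted MDP (the problem data is illustrative):

import numpy as np
from rlberry_scool.agents.dynprog.utils import value_iteration

S, A = 4, 2
R = np.random.uniform(0.0, 1.0, (S, A))
P = np.random.uniform(0.0, 1.0, (S, A, S))
P /= P.sum(axis=-1, keepdims=True)       # normalize so each P[s, a, :] sums to 1
Q, V, n_it = value_iteration(R, P, 0.9, 1e-8)    # gamma=0.9, epsilon=1e-8
print(n_it)                              # number of Bellman iterations until convergence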


@numba_jit
def bellman_operator(Q, R, P, gamma):
    """
    Bellman optimality operator for Q functions.

    Parameters
    ----------
    Q : numpy.ndarray
        array of shape (S, A) containing the Q function to which the operator
        is applied
    R : numpy.ndarray
        array of shape (S, A) containing the rewards, where S is the number
        of states and A is the number of actions
    P : numpy.ndarray
        array of shape (S, A, S) such that P[s,a,ns] is the probability of
        arriving at ns by taking action a in state s.
    gamma : double
        discount factor

    Returns
    -------
    TQ, array of shape (S, A) containing the result of the Bellman operator
    applied to the input Q
    """
    S, A = Q.shape
    TQ = np.zeros((S, A))
    V = np.zeros(S)
    # numba does not support np.max(Q, axis=1)
    for ss in range(S):
        V[ss] = Q[ss, :].max()
    for ss in range(S):
        for aa in range(A):
            TQ[ss, aa] = R[ss, aa]
            for ns in range(S):
                TQ[ss, aa] += gamma * P[ss, aa, ns] * V[ns]
    return TQ
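
The Q function returned by value_iteration should be an approximate fixed point of bellman_operator; a quick sanity-check sketch on toy data (illustrative only):

import numpy as np
from rlberry_scool.agents.dynprog.utils import bellman_operator, value_iteration

S, A, gamma = 4, 2, 0.9
R = np.random.uniform(0.0, 1.0, (S, A))
P = np.ones((S, A, S)) / S               # uniform transitions
Q, V, _ = value_iteration(R, P, gamma, 1e-10)
TQ = bellman_operator(Q, R, P, gamma)
assert np.abs(TQ - Q).max() < 1e-8       # TQ is (approximately) equal to Q
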
82 changes: 82 additions & 0 deletions rlberry_scool/agents/dynprog/value_iteration.py
@@ -0,0 +1,82 @@
from rlberry.agents.agent import AgentWithSimplePolicy
from rlberry_scool.agents.dynprog.utils import backward_induction, value_iteration
from rlberry.envs.finite.finite_mdp import FiniteMDP


class ValueIterationAgent(AgentWithSimplePolicy):
    """
    Value iteration for environments of type FiniteMDP
    (rlberry.envs.finite.finite_mdp.FiniteMDP).

    Important: the discount gamma is also used if the problem is
    finite horizon, but, in this case, gamma can be set to 1.0.

    Parameters
    ----------
    env : rlberry.envs.finite.finite_mdp.FiniteMDP
        Environment used to fit the agent.
    gamma : double, default: 0.95
        Discount factor in [0, 1].
    horizon : int, default: None
        Horizon, if the problem is finite-horizon. If None, the discounted
        problem is solved.
    epsilon : double, default: 1e-6
        Precision of value iteration, only used in discounted problems
        (when horizon is None).
    """

name = "ValueIteration"

def __init__(self, env, gamma=0.95, horizon=None, epsilon=1e-6, **kwargs):
AgentWithSimplePolicy.__init__(self, env, **kwargs)

# initialize base class
assert isinstance(
self.env, FiniteMDP
), "Value iteration requires a FiniteMDP model."
#

self.gamma = gamma # attribute gamma

self.horizon = horizon
self.epsilon = epsilon

# value functions
self.Q = None
self.V = None

    def fit(self, budget=None, **kwargs):
        """
        Run value iteration.

        Parameters
        ----------
        budget : None
            Not used. Only defined for compatibility purposes with rlberry.
            Changing the `budget` value has no effect.
        """
        del kwargs
        info = {}
        if self.horizon is None:
            assert self.gamma < 1.0, "The discounted setting requires gamma < 1.0"
            self.Q, self.V, n_it = value_iteration(
                self.env.R, self.env.P, self.gamma, self.epsilon
            )
            info["n_iterations"] = n_it
            info["precision"] = self.epsilon
        else:
            self.Q, self.V = backward_induction(
                self.env.R, self.env.P, self.horizon, self.gamma
            )
            info["n_iterations"] = self.horizon
            info["precision"] = 0.0
        return info

    def policy(self, observation):
        state = observation
        if self.horizon is None:
            return self.Q[state, :].argmax()
        else:
            return self.Q[0, state, :].argmax()
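
A minimal usage sketch for ValueIterationAgent, assuming a FiniteMDP instance can be built directly from reward and transition arrays (R, P); the toy arrays below are illustrative:

import numpy as np
from rlberry.envs.finite.finite_mdp import FiniteMDP
from rlberry_scool.agents.dynprog.value_iteration import ValueIterationAgent

S, A = 3, 2
R = np.random.uniform(0.0, 1.0, (S, A))
P = np.ones((S, A, S)) / S                   # uniform transitions
env = FiniteMDP(R, P)                        # assumption: FiniteMDP accepts (R, P)
agent = ValueIterationAgent(env, gamma=0.95) # discounted setting (horizon=None)
agent.fit()
print(agent.policy(0))                       # greedy action in state 0
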
1 change: 1 addition & 0 deletions rlberry_scool/agents/mbqvi/__init__.py
@@ -0,0 +1 @@
from .mbqvi import MBQVIAgent
