Commit
Merge branch 'main' of https://github.com/rlberry-py/rlberry-scool
Showing 13 changed files with 1,141 additions and 0 deletions.
@@ -0,0 +1,4 @@
# Interfaces
from .dynprog import ValueIterationAgent
from .mbqvi import MBQVIAgent
from .ucbvi import UCBVIAgent
@@ -0,0 +1 @@
from .value_iteration import ValueIterationAgent
@@ -0,0 +1,272 @@
import numpy as np
from rlberry.utils.jit_setup import numba_jit


@numba_jit
def backward_induction(R, P, horizon, gamma=1.0, vmax=np.inf):
    """Backward induction to compute the Q and V functions in the finite
    horizon setting.

    Parameters
    ----------
    R : numpy.ndarray
        array of shape (S, A) containing the rewards, where S is the number
        of states and A is the number of actions
    P : numpy.ndarray
        array of shape (S, A, S) such that P[s, a, ns] is the probability of
        arriving at ns by taking action a in state s.
    horizon : int
        problem horizon
    gamma : double, default: 1.0
        discount factor
    vmax : double, default: np.inf
        maximum possible value in V

    Returns
    -------
    tuple (Q, V) containing the Q and V functions, of shapes (horizon, S, A)
    and (horizon, S), respectively.
    """
    S, A = R.shape
    V = np.zeros((horizon, S))
    Q = np.zeros((horizon, S, A))
    for hh in range(horizon - 1, -1, -1):
        for ss in range(S):
            max_q = -np.inf
            for aa in range(A):
                q_aa = R[ss, aa]
                if hh < horizon - 1:
                    # explicit loop instead of .dot to avoid a scipy dependency
                    # (numba seems to require scipy for linear algebra
                    # operations on numpy arrays)
                    for ns in range(S):
                        q_aa += gamma * P[ss, aa, ns] * V[hh + 1, ns]
                if q_aa > max_q:
                    max_q = q_aa
                Q[hh, ss, aa] = q_aa
            V[hh, ss] = max_q
            # clip V at vmax
            if V[hh, ss] > vmax:
                V[hh, ss] = vmax
    return Q, V


@numba_jit
def backward_induction_reward_sd(Q, V, R, P, gamma=1.0, vmax=np.inf):
    """
    Backward induction to compute the Q and V functions in the finite
    horizon setting.

    Assumes that R is stage-dependent, but P is stage-independent.
    Takes as input the arrays in which Q and V are stored (in place).
    The horizon is inferred from the first dimension of R.

    Parameters
    ----------
    Q : numpy.ndarray
        array of shape (horizon, S, A) in which to store the Q function
    V : numpy.ndarray
        array of shape (horizon, S) in which to store the V function
    R : numpy.ndarray
        array of shape (horizon, S, A) containing the rewards, where S is the
        number of states and A is the number of actions
    P : numpy.ndarray
        array of shape (S, A, S) such that P[s, a, ns] is the probability of
        arriving at ns by taking action a in state s.
    gamma : double, default: 1.0
        discount factor
    vmax : double, default: np.inf
        maximum possible value in V
    """
    H, S, A = R.shape
    horizon = H
    for hh in range(horizon - 1, -1, -1):
        for ss in range(S):
            max_q = -np.inf
            for aa in range(A):
                q_aa = R[hh, ss, aa]
                if hh < horizon - 1:
                    # explicit loop instead of .dot to avoid a scipy dependency
                    # (numba seems to require scipy for linear algebra
                    # operations on numpy arrays)
                    for ns in range(S):
                        q_aa += gamma * P[ss, aa, ns] * V[hh + 1, ns]
                if q_aa > max_q:
                    max_q = q_aa
                Q[hh, ss, aa] = q_aa
            V[hh, ss] = max_q
            # clip V at vmax
            if V[hh, ss] > vmax:
                V[hh, ss] = vmax


@numba_jit
def backward_induction_in_place(Q, V, R, P, horizon, gamma=1.0, vmax=np.inf):
    """
    Backward induction to compute the Q and V functions in the finite
    horizon setting.

    Takes as input the arrays in which Q and V are stored (in place).

    Parameters
    ----------
    Q : numpy.ndarray
        array of shape (horizon, S, A) in which to store the Q function
    V : numpy.ndarray
        array of shape (horizon, S) in which to store the V function
    R : numpy.ndarray
        array of shape (S, A) containing the rewards, where S is the number
        of states and A is the number of actions
    P : numpy.ndarray
        array of shape (S, A, S) such that P[s, a, ns] is the probability of
        arriving at ns by taking action a in state s.
    horizon : int
        problem horizon
    gamma : double, default: 1.0
        discount factor
    vmax : double, default: np.inf
        maximum possible value in V
    """
    S, A = R.shape
    for hh in range(horizon - 1, -1, -1):
        for ss in range(S):
            max_q = -np.inf
            for aa in range(A):
                q_aa = R[ss, aa]
                if hh < horizon - 1:
                    # explicit loop instead of .dot to avoid a scipy dependency
                    # (numba seems to require scipy for linear algebra
                    # operations on numpy arrays)
                    for ns in range(S):
                        q_aa += gamma * P[ss, aa, ns] * V[hh + 1, ns]
                if q_aa > max_q:
                    max_q = q_aa
                Q[hh, ss, aa] = q_aa
            V[hh, ss] = max_q
            # clip V at vmax
            if V[hh, ss] > vmax:
                V[hh, ss] = vmax


@numba_jit
def backward_induction_sd(Q, V, R, P, gamma=1.0, vmax=np.inf):
    """
    In-place implementation of backward induction to compute the Q and V
    functions in the finite horizon setting.

    Assumes that both R and P are stage-dependent.
    The horizon is inferred from the first dimension of R.

    Parameters
    ----------
    Q : numpy.ndarray
        array of shape (H, S, A) in which to store the Q function
    V : numpy.ndarray
        array of shape (H, S) in which to store the V function
    R : numpy.ndarray
        array of shape (H, S, A) containing the rewards, where S is the number
        of states and A is the number of actions
    P : numpy.ndarray
        array of shape (H, S, A, S) such that P[h, s, a, ns] is the probability
        of arriving at ns by taking action a in state s at stage h.
    gamma : double, default: 1.0
        discount factor
    vmax : double, default: np.inf
        maximum possible value in V
    """
    H, S, A = R.shape
    for hh in range(H - 1, -1, -1):
        for ss in range(S):
            max_q = -np.inf
            for aa in range(A):
                q_aa = R[hh, ss, aa]
                if hh < H - 1:
                    # explicit loop instead of .dot to avoid a scipy dependency
                    # (numba seems to require scipy for linear algebra
                    # operations on numpy arrays)
                    for ns in range(S):
                        q_aa += gamma * P[hh, ss, aa, ns] * V[hh + 1, ns]
                if q_aa > max_q:
                    max_q = q_aa
                Q[hh, ss, aa] = q_aa
            V[hh, ss] = max_q
            # clip V at vmax
            if V[hh, ss] > vmax:
                V[hh, ss] = vmax


@numba_jit
def value_iteration(R, P, gamma, epsilon=1e-6):
    """
    Value iteration for discounted problems.

    Parameters
    ----------
    R : numpy.ndarray
        array of shape (S, A) containing the rewards, where S is the number
        of states and A is the number of actions
    P : numpy.ndarray
        array of shape (S, A, S) such that P[s, a, ns] is the probability of
        arriving at ns by taking action a in state s.
    gamma : double
        discount factor
    epsilon : double, default: 1e-6
        precision (stopping criterion on the sup norm of the Q-update)

    Returns
    -------
    tuple (Q, V, n_it) containing the epsilon-optimal Q and V functions,
    of shapes (S, A) and (S,), respectively, and n_it, the number of
    iterations.
    """
    S, A = R.shape
    Q = np.zeros((S, A))
    Q_aux = np.full((S, A), np.inf)
    n_it = 0
    while np.abs(Q - Q_aux).max() > epsilon:
        Q_aux = Q
        Q = bellman_operator(Q, R, P, gamma)
        n_it += 1
    V = np.zeros(S)
    # numba does not support np.max(Q, axis=1)
    for ss in range(S):
        V[ss] = Q[ss, :].max()
    return Q, V, n_it


@numba_jit
def bellman_operator(Q, R, P, gamma):
    """
    Bellman optimality operator for Q functions.

    Parameters
    ----------
    Q : numpy.ndarray
        array of shape (S, A) containing the Q function to which the operator
        is applied
    R : numpy.ndarray
        array of shape (S, A) containing the rewards, where S is the number
        of states and A is the number of actions
    P : numpy.ndarray
        array of shape (S, A, S) such that P[s, a, ns] is the probability of
        arriving at ns by taking action a in state s.
    gamma : double
        discount factor

    Returns
    -------
    TQ, array of shape (S, A) containing the result of the Bellman operator
    applied to the input Q.
    """
    S, A = Q.shape
    TQ = np.zeros((S, A))
    V = np.zeros(S)
    # numba does not support np.max(Q, axis=1)
    for ss in range(S):
        V[ss] = Q[ss, :].max()
    for ss in range(S):
        for aa in range(A):
            TQ[ss, aa] = R[ss, aa]
            for ns in range(S):
                TQ[ss, aa] += gamma * P[ss, aa, ns] * V[ns]
    return TQ
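
All of the routines above implement the same recursion, Q[h, s, a] = R(s, a) + gamma * sum_ns P(s, a, ns) * V[h + 1, ns] with V[h, s] = max_a Q[h, s, a] clipped at vmax; the variants only differ in whether R and P are stage-dependent and whether the output arrays are preallocated. A minimal sanity-check sketch follows; the toy MDP is made up for illustration, and the import path is the one used by the agent file further down.

import numpy as np
from rlberry_scool.agents.dynprog.utils import backward_induction, value_iteration

# Small random MDP, purely illustrative: 3 states, 2 actions.
rng = np.random.default_rng(0)
S, A = 3, 2
R = rng.uniform(0.0, 1.0, (S, A))
P = rng.uniform(0.0, 1.0, (S, A, S))
P /= P.sum(axis=-1, keepdims=True)  # make each P[s, a, :] a probability distribution

# Finite-horizon: Q has shape (horizon, S, A), V has shape (horizon, S).
Q_fh, V_fh = backward_induction(R, P, 10, 1.0)

# Discounted: iterate the Bellman operator until the sup-norm gap is below epsilon.
Q_d, V_d, n_it = value_iteration(R, P, 0.95, 1e-6)
print(Q_fh.shape, V_fh.shape, V_d, n_it)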
@@ -0,0 +1,82 @@
from rlberry.agents.agent import AgentWithSimplePolicy
from rlberry_scool.agents.dynprog.utils import backward_induction, value_iteration
from rlberry.envs.finite.finite_mdp import FiniteMDP


class ValueIterationAgent(AgentWithSimplePolicy):
    """
    Value iteration for environments of type FiniteMDP
    (rlberry.envs.finite.finite_mdp.FiniteMDP).

    Important: the discount gamma is also used if the problem is
    finite horizon, but, in this case, gamma can be set to 1.0.

    Parameters
    ----------
    env : rlberry.envs.finite.finite_mdp.FiniteMDP
        Environment used to fit the agent.
    gamma : double, default: 0.95
        Discount factor in [0, 1].
    horizon : int, default: None
        Horizon, if the problem is finite-horizon. If None, the discounted
        problem is solved.
    epsilon : double, default: 1e-6
        Precision of value iteration, only used in discounted problems
        (when horizon is None).
    """

    name = "ValueIteration"

    def __init__(self, env, gamma=0.95, horizon=None, epsilon=1e-6, **kwargs):
        # initialize base class
        AgentWithSimplePolicy.__init__(self, env, **kwargs)

        # value iteration requires access to the full model (R and P)
        assert isinstance(
            self.env, FiniteMDP
        ), "Value iteration requires a FiniteMDP model."

        self.gamma = gamma
        self.horizon = horizon
        self.epsilon = epsilon

        # value functions, computed in fit()
        self.Q = None
        self.V = None

    def fit(self, budget=None, **kwargs):
        """
        Run value iteration.

        Parameters
        ----------
        budget : None
            Not used. Only defined for compatibility purposes with rlberry.
            Changing the `budget` value has no effect.
        """
        del kwargs
        info = {}
        if self.horizon is None:
            assert self.gamma < 1.0, "The discounted setting requires gamma < 1.0"
            self.Q, self.V, n_it = value_iteration(
                self.env.R, self.env.P, self.gamma, self.epsilon
            )
            info["n_iterations"] = n_it
            info["precision"] = self.epsilon
        else:
            self.Q, self.V = backward_induction(
                self.env.R, self.env.P, self.horizon, self.gamma
            )
            info["n_iterations"] = self.horizon
            info["precision"] = 0.0
        return info

    def policy(self, observation):
        state = observation
        if self.horizon is None:
            # stationary greedy policy for the discounted problem
            return self.Q[state, :].argmax()
        else:
            # greedy policy at the first stage of the finite-horizon problem
            return self.Q[0, state, :].argmax()
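
For context, a hypothetical usage sketch of the agent defined above. The import paths come from the diff itself, but the toy MDP is made up, and the assumption that FiniteMDP can be constructed directly from (R, P) arrays of shapes (S, A) and (S, A, S) should be checked against the rlberry documentation.

import numpy as np
from rlberry.envs.finite.finite_mdp import FiniteMDP  # same import path as in the file above
from rlberry_scool.agents.dynprog import ValueIterationAgent

# Toy MDP, illustrative only.
rng = np.random.default_rng(42)
S, A = 4, 2
R = rng.uniform(0.0, 1.0, (S, A))
P = rng.uniform(0.0, 1.0, (S, A, S))
P /= P.sum(axis=-1, keepdims=True)
env = FiniteMDP(R, P)  # assumed constructor: rewards and transition kernel

# Discounted setting (horizon=None): fit() runs value_iteration.
agent = ValueIterationAgent(env, gamma=0.95)
print(agent.fit())      # e.g. {'n_iterations': ..., 'precision': 1e-06}
print(agent.policy(0))  # greedy action in state 0

# Finite-horizon setting: fit() runs backward_induction instead.
agent_fh = ValueIterationAgent(env, gamma=1.0, horizon=10)
agent_fh.fit()
print(agent_fh.policy(0))  # greedy action at the first stage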
@@ -0,0 +1 @@
from .mbqvi import MBQVIAgent