Commit
use epsilon to compute log probs for deterministic pi
antoine-galataud committed May 28, 2024
1 parent 5e71ab6 commit ff83101
Showing 4 changed files with 78 additions and 18 deletions.
45 changes: 37 additions & 8 deletions hopes/policy/policies.py
@@ -10,7 +10,7 @@
from sklearn.metrics import accuracy_score, f1_score

from hopes.dev_utils import override
from hopes.policy.utils import bin_actions, deterministic_log_probs
from hopes.policy.utils import bin_actions, log_probs_for_deterministic_policy


class Policy(ABC):
@@ -26,6 +26,10 @@ class Policy(ABC):
def name(self):
return self._name or self.__class__.__name__

@property
def epsilon(self):
return self._epsilon

def with_name(self, name: str) -> "Policy":
"""Set the name of the policy. This is optional but can be useful for logging,
visualization and comparison with other policies.
@@ -236,9 +240,10 @@ class PiecewiseLinearPolicy(Policy):
temperature and is bounded by a minimum and maximum on both axes. This can also be
helpful to model a simple schedule, where action is a function of time.
Since the output of a piecewise linear model is deterministic, the log-probabilities are
computed by assuming the function is deterministic and assigning a probability of 1 to
the action returned by the function and an almost zero probability to all other actions.
Since the output of the piecewise linear model is deterministic, the log-probabilities
are computed by assuming the function is deterministic and assigning a probability of ~1
to the action returned by the function and an almost zero probability to all other
actions.
Also, the piecewise linear policy output being continuous, we need to discretize the
action space to compute the log-probabilities. This is done by binning the actions to
@@ -250,12 +255,15 @@ def __init__(
num_segments: int,
obs: np.ndarray,
act: np.ndarray,
epsilon: float,
actions_bins: list[float | int] | None = None,
):
"""
:param num_segments: the number of segments for the piecewise linear model.
:param obs: the observations for training the piecewise linear model, shape: (batch_size, obs_dim).
:param act: the actions for training the piecewise linear model, shape: (batch_size,).
:param epsilon: the epsilon value for epsilon-greedy action selection. This is mandatory for computing
log-probabilities since the policy is deterministic.
:param actions_bins: the bins for discretizing the action space. If not provided, we assume the action space
is already discretized.
"""
@@ -264,12 +272,15 @@ def __init__(
len(obs.shape) == 1 or obs.shape[1] == 1
), "Piecewise linear policy only supports 1D observations."
assert obs.shape[0] == act.shape[0], "Number of observations and actions must match."
assert epsilon is not None, "Epsilon must be set for piecewise linear policy."

self.num_segments = num_segments
self.model_obs = obs.squeeze() if obs.ndim == 2 else obs
self.model_act = act.squeeze() if act.ndim == 2 else act
self.model = None

self._epsilon = epsilon

# bins used to discretize the action space
self.actions_bins = actions_bins if actions_bins else np.unique(self.model_act)

@@ -304,14 +315,18 @@ def fit(self) -> dict[str, float]:
def log_probabilities(self, obs: np.ndarray) -> np.ndarray:
"""Compute the log-probabilities of the actions under the piecewise linear policy for a
given set of observations."""
assert self.epsilon is not None, (
"Epsilon must be set for piecewise linear policy (using with_epsilon method)."
"This is used for log-probability computation."
)
if obs.ndim == 1:
raw_actions = self.model.predict(obs)
else:
raw_actions = np.array([self.model.predict(o) for o in obs])
# bin the action to the nearest action using the discretized action space
actions = bin_actions(raw_actions, self.actions_bins)
# return the log-probabilities
return deterministic_log_probs(actions, self.actions_bins)
return log_probs_for_deterministic_policy(actions, self.actions_bins, self.epsilon)
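
For reference, a minimal usage sketch of the updated `PiecewiseLinearPolicy` constructor and log-probability call, using made-up observation/action data and the same bins/epsilon as the test further down (not part of the commit):

```python
import numpy as np

from hopes.policy.policies import PiecewiseLinearPolicy

# Synthetic 1D data: a setpoint that decreases linearly with the observation, clipped to [15, 30].
obs = np.linspace(-10, 30, 200)
act = np.clip(30 - 0.5 * obs, 15, 30)

policy = PiecewiseLinearPolicy(
    num_segments=3,
    obs=obs,
    act=act,
    epsilon=0.01,  # now mandatory: used to smooth the otherwise deterministic probabilities
    actions_bins=list(range(15, 31)),
)
policy.fit()

log_probs = policy.log_probabilities(obs)
# each row is an epsilon-smoothed distribution over the 16 action bins and sums to ~1
print(np.exp(log_probs).sum(axis=1)[:3])
```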


class FunctionBasedPolicy(Policy):
@@ -322,25 +337,39 @@ class FunctionBasedPolicy(Policy):
to all other actions. The action space is discretized to compute the log-probabilities.
"""

def __init__(self, policy_function: callable, actions_bins: list[float | int]) -> None:
def __init__(
self, policy_function: callable, epsilon: float, actions_bins: list[float | int]
) -> None:
"""
:param policy_function: a function that takes in observations and returns actions.
:param epsilon: the epsilon value for epsilon-greedy action selection.
This is mandatory for computing log-probabilities since the policy is deterministic.
:param actions_bins: the bins for discretizing the action space.
"""
assert callable(policy_function), "Policy function must be callable."
assert len(actions_bins) > 0, "Action bins must be non-empty."
assert np.all(np.diff(actions_bins) > 0), "Action bins must be in increasing order."
assert len(actions_bins) == len(np.unique(actions_bins)), "Action bins must be unique."

assert policy_function is not None, "Policy function must be set."
assert callable(policy_function), "Policy function must be callable."

self.policy_function = policy_function
self.actions_bins = np.array(actions_bins)
self._epsilon = epsilon

@override(Policy)
def log_probabilities(self, obs: np.ndarray) -> np.ndarray:
"""Compute the log-probabilities of the actions under the function-based policy for a given
set of observations."""
assert self.epsilon is not None, (
"Epsilon must be set for function-based policy (using with_epsilon method)."
"This is used for log-probability computation."
)
raw_actions = np.vectorize(self.policy_function)(obs)
# bin the action to the nearest action using the discretized action space
actions = bin_actions(raw_actions, self.actions_bins)
# return the log-probabilities
return deterministic_log_probs(actions, self.actions_bins)
return log_probs_for_deterministic_policy(actions, self.actions_bins, self.epsilon)
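
A similar sketch for `FunctionBasedPolicy`, with a purely illustrative threshold rule standing in for the deterministic policy function (not part of the commit):

```python
import numpy as np

from hopes.policy.policies import FunctionBasedPolicy

def pi(o):
    # illustrative deterministic rule: high setpoint for low observations, low setpoint otherwise
    return 30.0 if o < 10 else 15.0

policy = FunctionBasedPolicy(policy_function=pi, epsilon=0.01, actions_bins=[15, 20, 25, 30])

obs = np.random.randint(-10, 30, 100).reshape(-1, 1)
log_probs = policy.log_probabilities(obs)  # expected shape: (100, 4), one row per observation
print(np.exp(log_probs).sum(axis=1)[:3])   # each row sums to ~1
```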


class HttpPolicy(Policy):
24 changes: 18 additions & 6 deletions hopes/policy/utils.py
@@ -1,17 +1,29 @@
import numpy as np


def deterministic_log_probs(
actions: np.ndarray, actions_bins: np.ndarray, lamb: float = 1e-6
def log_probs_for_deterministic_policy(
actions: np.ndarray, actions_bins: np.ndarray, epsilon: float = 1e-6
) -> np.ndarray:
"""Compute the log probabilities of a given set of actions, assuming a deterministic policy.
"""Compute the log probabilities of a given set of actions, assuming a given deterministic
policy.
We assign a probability of ~1 to the action returned by the policy and an almost zero
probability to all other actions (note: the probabilities must sum to 1)
:param actions: the actions for which to compute the log probabilities.
:param actions_bins: the set of possible actions.
:param epsilon: the small value to use for the probabilities of the other actions.
"""
top = np.log(1.0 - (lamb * (len(actions_bins) - 1)))
others = np.log(lamb)
return np.array([[top if a == action else others for a in actions_bins] for action in actions])
assert np.all(np.isin(actions, actions_bins)), "Some actions are not in the action bins."
unlikely_p = epsilon / len(actions_bins)
return np.log(
np.array(
[
[(1.0 - epsilon) + unlikely_p if a == action else unlikely_p for a in actions_bins]
for action in actions
]
)
)
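
To make the epsilon smoothing concrete, a small worked example of the helper as it appears above (values are illustrative, not part of the commit):

```python
import numpy as np

from hopes.policy.utils import log_probs_for_deterministic_policy

actions = np.array([15, 20])          # deterministic choices made by the policy
actions_bins = np.array([15, 20, 25])

probs = np.exp(log_probs_for_deterministic_policy(actions, actions_bins, epsilon=0.1))

# chosen action: (1 - 0.1) + 0.1/3 ≈ 0.933, other actions: 0.1/3 ≈ 0.033
print(np.round(probs, 3))    # [[0.933 0.033 0.033]
                             #  [0.033 0.933 0.033]]
print(probs.sum(axis=1))     # [1. 1.] -- each row is a valid distribution
```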


def bin_actions(actions: np.ndarray, bins: np.ndarray) -> np.ndarray:
11 changes: 10 additions & 1 deletion tests/test_estimators.py
@@ -3,7 +3,16 @@
import numpy as np
from action_probs_utils import generate_action_probs

from hopes.ope.estimators import *
from hopes.ope.estimators import (
BaseEstimator,
DirectMethod,
InverseProbabilityWeighting,
PerDecisionImportanceSampling,
SelfNormalizedInverseProbabilityWeighting,
SelfNormalizedPerDecisionImportanceSampling,
SelfNormalizedTrajectoryWiseImportanceSampling,
TrajectoryWiseImportanceSampling,
)
from hopes.rew.rewards import RegressionBasedRewardModel


16 changes: 13 additions & 3 deletions tests/test_policies.py
@@ -13,7 +13,7 @@
PiecewiseLinearPolicy,
RandomPolicy,
)
from hopes.policy.utils import piecewise_linear
from hopes.policy.utils import log_probs_for_deterministic_policy, piecewise_linear
from tests.action_probs_utils import generate_action_probs
from tests.utils import assert_act_probs, assert_log_probs

@@ -68,7 +68,9 @@ def test_piecewise_linear_policy(self):
bins = list(range(15, 31))

# create and fit a piecewise linear policy
reg_policy = PiecewiseLinearPolicy(num_segments=3, obs=obs, act=act, actions_bins=bins)
reg_policy = PiecewiseLinearPolicy(
num_segments=3, obs=obs, act=act, actions_bins=bins, epsilon=0.01
)
fit_stats = reg_policy.fit()
self.assertIsInstance(fit_stats, dict)
self.assertIn("rmse", fit_stats)
@@ -92,7 +94,9 @@ def pi(_obs):
return piecewise_linear(_obs, y0=30, y1=15, left_cp=10, right_cp=20, slope=-0.5)

# create and fit a piecewise linear policy
reg_policy = FunctionBasedPolicy(policy_function=pi, actions_bins=[15, 20, 25, 30])
reg_policy = FunctionBasedPolicy(
policy_function=pi, actions_bins=[15, 20, 25, 30], epsilon=0.01
)

# check if the policy returns valid log-probs
obs = np.random.randint(-10, 30, 100).reshape(-1, 1)
@@ -176,3 +180,9 @@ def test_select_action_rnd_determ_eps(self):
class_pol.with_epsilon(0.5)
actions = [class_pol.select_action(obs=obs, deterministic=False) for _ in range(100)]
self.assertTrue(np.var(actions) > 0)

def test_log_probs_for_deterministic_policy(self):
actions = np.array([0, 1, 2, 0, 1])
actions_bins = np.array([0, 1, 2])
log_probs = log_probs_for_deterministic_policy(actions, actions_bins)
assert_log_probs(log_probs, expected_shape=(5, 3))
