Commit
use epsilon to compute log probs for deterministic pi
antoine-galataud committed May 28, 2024
1 parent 5e71ab6 commit ff83101
Showing 4 changed files with 78 additions and 18 deletions.
45 changes: 37 additions & 8 deletions hopes/policy/policies.py
@@ -10,7 +10,7 @@
from sklearn.metrics import accuracy_score, f1_score

from hopes.dev_utils import override
from hopes.policy.utils import bin_actions, deterministic_log_probs
from hopes.policy.utils import bin_actions, log_probs_for_deterministic_policy


class Policy(ABC):
@@ -26,6 +26,10 @@ class Policy(ABC):
def name(self):
return self._name or self.__class__.__name__

@property
def epsilon(self):
return self._epsilon

def with_name(self, name: str) -> "Policy":
"""Set the name of the policy. This is optional but can be useful for logging,
visualization and comparison with other policies.
@@ -236,9 +240,10 @@ class PiecewiseLinearPolicy(Policy):
temperature and is bounded by a minimum and maximum on both axes. This can also be
helpful to model a simple schedule, where action is a function of time.
Since the output of a piecewise linear model is deterministic, the log-probabilities are
computed by assuming the function is deterministic and assigning a probability of 1 to
the action returned by the function and an almost zero probability to all other actions.
Since the output of the piecewise linear model is deterministic, the log-probabilities
are computed by assuming the function is deterministic and assigning a probability of ~1
to the action returned by the function and an almost zero probability to all other
actions.
Also, the piecewise linear policy output being continuous, we need to discretize the
action space to compute the log-probabilities. This is done by binning the actions to
@@ -250,12 +255,15 @@ def __init__(
num_segments: int,
obs: np.ndarray,
act: np.ndarray,
epsilon: float,
actions_bins: list[float | int] | None = None,
):
"""
:param num_segments: the number of segments for the piecewise linear model.
:param obs: the observations for training the piecewise linear model, shape: (batch_size, obs_dim).
:param act: the actions for training the piecewise linear model, shape: (batch_size,).
:param epsilon: the epsilon value for epsilon-greedy action selection. This is mandatory for computing
log-probabilities since the policy is deterministic.
:param actions_bins: the bins for discretizing the action space. If not provided, we assume the action space
is already discretized.
"""
@@ -264,12 +272,15 @@ def __init__(
len(obs.shape) == 1 or obs.shape[1] == 1
), "Piecewise linear policy only supports 1D observations."
assert obs.shape[0] == act.shape[0], "Number of observations and actions must match."
assert epsilon is not None, "Epsilon must be set for piecewise linear policy."

self.num_segments = num_segments
self.model_obs = obs.squeeze() if obs.ndim == 2 else obs
self.model_act = act.squeeze() if act.ndim == 2 else act
self.model = None

self._epsilon = epsilon

# bins used to discretize the action space
self.actions_bins = actions_bins if actions_bins else np.unique(self.model_act)

@@ -304,14 +315,18 @@ def fit(self) -> dict[str, float]:
def log_probabilities(self, obs: np.ndarray) -> np.ndarray:
"""Compute the log-probabilities of the actions under the piecewise linear policy for a
given set of observations."""
assert self.epsilon is not None, (
"Epsilon must be set for piecewise linear policy (using with_epsilon method)."
"This is used for log-probability computation."
)
if obs.ndim == 1:
raw_actions = self.model.predict(obs)
else:
raw_actions = np.array([self.model.predict(o) for o in obs])
# bin the action to the nearest action using the discretized action space
actions = bin_actions(raw_actions, self.actions_bins)
# return the log-probabilities
return deterministic_log_probs(actions, self.actions_bins)
return log_probs_for_deterministic_policy(actions, self.actions_bins, self.epsilon)
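
For reference, a minimal usage sketch of the updated `PiecewiseLinearPolicy` constructor and log-probability call, using made-up observation/action data and the same bins/epsilon as the test further down (not part of the commit):

```python
import numpy as np

from hopes.policy.policies import PiecewiseLinearPolicy

# Synthetic 1D data: a setpoint that decreases linearly with the observation, clipped to [15, 30].
obs = np.linspace(-10, 30, 200)
act = np.clip(30 - 0.5 * obs, 15, 30)

policy = PiecewiseLinearPolicy(
    num_segments=3,
    obs=obs,
    act=act,
    epsilon=0.01,  # now mandatory: used to smooth the otherwise deterministic probabilities
    actions_bins=list(range(15, 31)),
)
policy.fit()

log_probs = policy.log_probabilities(obs)
# each row is an epsilon-smoothed distribution over the 16 action bins and sums to ~1
print(np.exp(log_probs).sum(axis=1)[:3])
```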


class FunctionBasedPolicy(Policy):
@@ -322,25 +337,39 @@ class FunctionBasedPolicy(Policy):
to all other actions. The action space is discretized to compute the log-probabilities.
"""

def __init__(self, policy_function: callable, actions_bins: list[float | int]) -> None:
def __init__(
self, policy_function: callable, epsilon: float, actions_bins: list[float | int]
) -> None:
"""
:param policy_function: a function that takes in observations and returns actions.
:param epsilon: the epsilon value for epsilon-greedy action selection.
This is mandatory for computing log-probabilities since the policy is deterministic.
:param actions_bins: the bins for discretizing the action space.
"""
assert callable(policy_function), "Policy function must be callable."
assert len(actions_bins) > 0, "Action bins must be non-empty."
assert np.all(np.diff(actions_bins) > 0), "Action bins must be in increasing order."
assert len(actions_bins) == len(np.unique(actions_bins)), "Action bins must be unique."

assert policy_function is not None, "Policy function must be set."
assert callable(policy_function), "Policy function must be callable."

self.policy_function = policy_function
self.actions_bins = np.array(actions_bins)
self._epsilon = epsilon

@override(Policy)
def log_probabilities(self, obs: np.ndarray) -> np.ndarray:
"""Compute the log-probabilities of the actions under the function-based policy for a given
set of observations."""
assert self.epsilon is not None, (
"Epsilon must be set for function-based policy (using with_epsilon method)."
"This is used for log-probability computation."
)
raw_actions = np.vectorize(self.policy_function)(obs)
# bin the action to the nearest action using the discretized action space
actions = bin_actions(raw_actions, self.actions_bins)
# return the log-probabilities
return deterministic_log_probs(actions, self.actions_bins)
return log_probs_for_deterministic_policy(actions, self.actions_bins, self.epsilon)
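
A similar sketch for `FunctionBasedPolicy`, with a purely illustrative threshold rule standing in for the deterministic policy function (not part of the commit):

```python
import numpy as np

from hopes.policy.policies import FunctionBasedPolicy

def pi(o):
    # illustrative deterministic rule: high setpoint for low observations, low setpoint otherwise
    return 30.0 if o < 10 else 15.0

policy = FunctionBasedPolicy(policy_function=pi, epsilon=0.01, actions_bins=[15, 20, 25, 30])

obs = np.random.randint(-10, 30, 100).reshape(-1, 1)
log_probs = policy.log_probabilities(obs)  # expected shape: (100, 4), one row per observation
print(np.exp(log_probs).sum(axis=1)[:3])   # each row sums to ~1
```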


class HttpPolicy(Policy):
24 changes: 18 additions & 6 deletions hopes/policy/utils.py
@@ -1,17 +1,29 @@
import numpy as np


def deterministic_log_probs(
actions: np.ndarray, actions_bins: np.ndarray, lamb: float = 1e-6
def log_probs_for_deterministic_policy(
actions: np.ndarray, actions_bins: np.ndarray, epsilon: float = 1e-6
) -> np.ndarray:
"""Compute the log probabilities of a given set of actions, assuming a deterministic policy.
"""Compute the log probabilities of a given set of actions, assuming a given deterministic
policy.
We assign a probability of ~1 to the action returned by the policy and an almost zero
probability to all other actions (note: the probabilities must sum to 1)
:param actions: the actions for which to compute the log probabilities.
:param actions_bins: the set of possible actions.
:param epsilon: the small value to use for the probabilities of the other actions.
"""
top = np.log(1.0 - (lamb * (len(actions_bins) - 1)))
others = np.log(lamb)
return np.array([[top if a == action else others for a in actions_bins] for action in actions])
assert np.all(np.isin(actions, actions_bins)), "Some actions are not in the action bins."
unlikely_p = epsilon / len(actions_bins)
return np.log(
np.array(
[
[(1.0 - epsilon) + unlikely_p if a == action else unlikely_p for a in actions_bins]
for action in actions
]
)
)
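
To make the epsilon smoothing concrete, a small worked example of the helper as it appears above (values are illustrative, not part of the commit):

```python
import numpy as np

from hopes.policy.utils import log_probs_for_deterministic_policy

actions = np.array([15, 20])          # deterministic choices made by the policy
actions_bins = np.array([15, 20, 25])

probs = np.exp(log_probs_for_deterministic_policy(actions, actions_bins, epsilon=0.1))

# chosen action: (1 - 0.1) + 0.1/3 ≈ 0.933, other actions: 0.1/3 ≈ 0.033
print(np.round(probs, 3))    # [[0.933 0.033 0.033]
                             #  [0.033 0.933 0.033]]
print(probs.sum(axis=1))     # [1. 1.] -- each row is a valid distribution
```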


def bin_actions(actions: np.ndarray, bins: np.ndarray) -> np.ndarray:
11 changes: 10 additions & 1 deletion tests/test_estimators.py
@@ -3,7 +3,16 @@
import numpy as np
from action_probs_utils import generate_action_probs

from hopes.ope.estimators import *
from hopes.ope.estimators import (
BaseEstimator,
DirectMethod,
InverseProbabilityWeighting,
PerDecisionImportanceSampling,
SelfNormalizedInverseProbabilityWeighting,
SelfNormalizedPerDecisionImportanceSampling,
SelfNormalizedTrajectoryWiseImportanceSampling,
TrajectoryWiseImportanceSampling,
)
from hopes.rew.rewards import RegressionBasedRewardModel


16 changes: 13 additions & 3 deletions tests/test_policies.py
@@ -13,7 +13,7 @@
PiecewiseLinearPolicy,
RandomPolicy,
)
from hopes.policy.utils import piecewise_linear
from hopes.policy.utils import log_probs_for_deterministic_policy, piecewise_linear
from tests.action_probs_utils import generate_action_probs
from tests.utils import assert_act_probs, assert_log_probs

@@ -68,7 +68,9 @@ def test_piecewise_linear_policy(self):
bins = list(range(15, 31))

# create and fit a piecewise linear policy
reg_policy = PiecewiseLinearPolicy(num_segments=3, obs=obs, act=act, actions_bins=bins)
reg_policy = PiecewiseLinearPolicy(
num_segments=3, obs=obs, act=act, actions_bins=bins, epsilon=0.01
)
fit_stats = reg_policy.fit()
self.assertIsInstance(fit_stats, dict)
self.assertIn("rmse", fit_stats)
@@ -92,7 +94,9 @@ def pi(_obs):
return piecewise_linear(_obs, y0=30, y1=15, left_cp=10, right_cp=20, slope=-0.5)

# create and fit a piecewise linear policy
reg_policy = FunctionBasedPolicy(policy_function=pi, actions_bins=[15, 20, 25, 30])
reg_policy = FunctionBasedPolicy(
policy_function=pi, actions_bins=[15, 20, 25, 30], epsilon=0.01
)

# check if the policy returns valid log-probs
obs = np.random.randint(-10, 30, 100).reshape(-1, 1)
@@ -176,3 +180,9 @@ def test_select_action_rnd_determ_eps(self):
class_pol.with_epsilon(0.5)
actions = [class_pol.select_action(obs=obs, deterministic=False) for _ in range(100)]
self.assertTrue(np.var(actions) > 0)

def test_log_probs_for_deterministic_policy(self):
actions = np.array([0, 1, 2, 0, 1])
actions_bins = np.array([0, 1, 2])
log_probs = log_probs_for_deterministic_policy(actions, actions_bins)
assert_log_probs(log_probs, expected_shape=(5, 3))
