Merge pull request st-tech#59 from st-tech/calc-ground-truth-synthetic
Add calc_ground_truth_policy_value to SyntheticBanditDataset
usaito authored Feb 7, 2021
2 parents 01aef32 + 023c5b5 commit 0d39b77
Showing 2 changed files with 116 additions and 4 deletions.
54 changes: 50 additions & 4 deletions obp/dataset/synthetic.py
@@ -201,7 +201,8 @@ def obtain_batch_bandit_feedback(self, n_rounds: int) -> BanditFeedback:
        action = np.array(
            [
                self.random_.choice(
-                   np.arange(self.n_actions), p=behavior_policy_[i],
+                   np.arange(self.n_actions),
+                   p=behavior_policy_[i],
                )
                for i in np.arange(n_rounds)
            ]
@@ -248,9 +249,50 @@ def obtain_batch_bandit_feedback(self, n_rounds: int) -> BanditFeedback:
            pscore=pscore,
        )

    def calc_ground_truth_policy_value(
        self, expected_reward: np.ndarray, action_dist: np.ndarray
    ) -> float:
        """Calculate the policy value of the given action distribution on the given expected_reward.

        Parameters
        -----------
        expected_reward: array-like, shape (n_rounds, n_actions)
            Expected reward given context (:math:`x`) and action (:math:`a`), i.e., :math:`q(x,a):=\\mathbb{E}[r|x,a]`.
            This is often the expected_reward of the test set of logged bandit feedback data.

        action_dist: array-like, shape (n_rounds, n_actions, len_list)
            Action choice probabilities by the evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`.

        Returns
        ----------
        policy_value: float
            The policy value of the given action distribution on the given bandit feedback data.

        """
        if not isinstance(expected_reward, np.ndarray):
            raise ValueError("expected_reward must be ndarray")
        if not isinstance(action_dist, np.ndarray):
            raise ValueError("action_dist must be ndarray")
        if action_dist.ndim != 3:
            raise ValueError(
                f"action_dist must be 3-dimensional, but is {action_dist.ndim}."
            )
        if expected_reward.shape[0] != action_dist.shape[0]:
            raise ValueError(
                "the size of axis 0 of expected_reward must be the same as that of action_dist"
            )
        if expected_reward.shape[1] != action_dist.shape[1]:
            raise ValueError(
                "the size of axis 1 of expected_reward must be the same as that of action_dist"
            )

        return np.average(expected_reward, weights=action_dist[:, :, 0], axis=1).mean()
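In plain terms, the return line evaluates the ground-truth value V(π_e) = (1/n_rounds) Σ_i Σ_a π_e(a | x_i) q(x_i, a): each row of expected_reward is averaged with the evaluation policy's action-choice probabilities as weights (only the first slot of the len_list dimension, action_dist[:, :, 0], is used), and the per-round values are then averaged over rounds.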


def logistic_reward_function(
-    context: np.ndarray, action_context: np.ndarray, random_state: Optional[int] = None,
+    context: np.ndarray,
+    action_context: np.ndarray,
+    random_state: Optional[int] = None,
) -> np.ndarray:
    """Logistic mean reward function for synthetic bandit datasets.
@@ -289,7 +331,9 @@ def logistic_reward_function(


def linear_reward_function(
-    context: np.ndarray, action_context: np.ndarray, random_state: Optional[int] = None,
+    context: np.ndarray,
+    action_context: np.ndarray,
+    random_state: Optional[int] = None,
) -> np.ndarray:
    """Linear mean reward function for synthetic bandit datasets.
@@ -328,7 +372,9 @@ def linear_reward_function(


def linear_behavior_policy(
-    context: np.ndarray, action_context: np.ndarray, random_state: Optional[int] = None,
+    context: np.ndarray,
+    action_context: np.ndarray,
+    random_state: Optional[int] = None,
) -> np.ndarray:
    """Linear contextual behavior policy for synthetic bandit datasets.
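As a usage sketch (not part of this commit), the new method can be exercised roughly as follows; it assumes SyntheticBanditDataset is importable from obp.dataset as in the tests below, and the constant expected_reward and uniform random evaluation policy are purely illustrative:

import numpy as np

from obp.dataset import SyntheticBanditDataset

n_rounds, n_actions, len_list = 5, 10, 1
dataset = SyntheticBanditDataset(n_actions=n_actions)

# illustrative inputs: constant expected reward and a uniform random evaluation policy
expected_reward = np.full((n_rounds, n_actions), 0.5)
action_dist = np.full((n_rounds, n_actions, len_list), 1.0 / n_actions)

policy_value = dataset.calc_ground_truth_policy_value(
    expected_reward=expected_reward, action_dist=action_dist
)
print(policy_value)  # 0.5: a uniform policy on a constant reward of 0.5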
66 changes: 66 additions & 0 deletions tests/dataset/test_synthetic.py
@@ -90,6 +90,72 @@ def test_synthetic_obtain_batch_bandit_feedback():
)


# expected_reward, action_dist, description
invalid_input_of_calc_policy_value = [
    (
        np.ones((2, 3)),
        np.ones((3, 3, 3)),
        "the size of axis 0 of expected_reward must be the same as that of action_dist",
    ),
    (
        np.ones((2, 3)),
        np.ones((2, 2, 3)),
        "the size of axis 1 of expected_reward must be the same as that of action_dist",
    ),
    ("3", np.ones((2, 2, 3)), "expected_reward must be ndarray"),
    (None, np.ones((2, 2, 3)), "expected_reward must be ndarray"),
    (np.ones((2, 3)), np.ones((2, 3)), "action_dist must be 3-dimensional, but is 2."),
    (np.ones((2, 3)), "3", "action_dist must be ndarray"),
    (np.ones((2, 3)), None, "action_dist must be ndarray"),
]

valid_input_of_calc_policy_value = [
    (
        np.ones((2, 3)),
        np.ones((2, 3, 1)),
        "valid shape",
    ),
]


@pytest.mark.parametrize(
    "expected_reward, action_dist, description",
    invalid_input_of_calc_policy_value,
)
def test_synthetic_calc_policy_value_using_invalid_inputs(
    expected_reward,
    action_dist,
    description,
):
    n_actions = 10
    dataset = SyntheticBanditDataset(n_actions=n_actions)

    with pytest.raises(ValueError, match=f"{description}*"):
        _ = dataset.calc_ground_truth_policy_value(
            expected_reward=expected_reward, action_dist=action_dist
        )


@pytest.mark.parametrize(
    "expected_reward, action_dist, description",
    valid_input_of_calc_policy_value,
)
def test_synthetic_calc_policy_value_using_valid_inputs(
    expected_reward,
    action_dist,
    description,
):
    n_actions = 10
    dataset = SyntheticBanditDataset(n_actions=n_actions)

    policy_value = dataset.calc_ground_truth_policy_value(
        expected_reward=expected_reward, action_dist=action_dist
    )
    assert isinstance(
        policy_value, float
    ), "Invalid response of calc_ground_truth_policy_value"


def test_synthetic_logistic_reward_function():
    # context
    with pytest.raises(ValueError):
