Merge pull request st-tech#59 from st-tech/calc-ground-truth-synthetic
Add calc_ground_truth_policy_value to SyntheticBanditDataset
usaito authored Feb 7, 2021
2 parents 01aef32 + 023c5b5 commit 0d39b77
Showing 2 changed files with 116 additions and 4 deletions.
54 changes: 50 additions & 4 deletions obp/dataset/synthetic.py
@@ -201,7 +201,8 @@ def obtain_batch_bandit_feedback(self, n_rounds: int) -> BanditFeedback:
        action = np.array(
            [
                self.random_.choice(
-                   np.arange(self.n_actions), p=behavior_policy_[i],
+                   np.arange(self.n_actions),
+                   p=behavior_policy_[i],
                )
                for i in np.arange(n_rounds)
            ]
@@ -248,9 +249,50 @@ def obtain_batch_bandit_feedback(self, n_rounds: int) -> BanditFeedback:
            pscore=pscore,
        )

    def calc_ground_truth_policy_value(
        self, expected_reward: np.ndarray, action_dist: np.ndarray
    ) -> float:
        """Calculate the policy value of the given action distribution on the given expected_reward.

        Parameters
        -----------
        expected_reward: array-like, shape (n_rounds, n_actions)
            Expected reward given context (:math:`x`) and action (:math:`a`), i.e., :math:`q(x,a):=\\mathbb{E}[r|x,a]`.
            This is often the expected_reward of the test set of logged bandit feedback data.

        action_dist: array-like, shape (n_rounds, n_actions, len_list)
            Action choice probabilities by the evaluation policy (can be deterministic), i.e., :math:`\\pi_e(a_t|x_t)`.

        Returns
        ----------
        policy_value: float
            The policy value of the given action distribution on the given bandit feedback data.

        """
        if not isinstance(expected_reward, np.ndarray):
            raise ValueError("expected_reward must be ndarray")
        if not isinstance(action_dist, np.ndarray):
            raise ValueError("action_dist must be ndarray")
        if action_dist.ndim != 3:
            raise ValueError(
                f"action_dist must be 3-dimensional, but is {action_dist.ndim}."
            )
        if expected_reward.shape[0] != action_dist.shape[0]:
            raise ValueError(
                "the size of axis 0 of expected_reward must be the same as that of action_dist"
            )
        if expected_reward.shape[1] != action_dist.shape[1]:
            raise ValueError(
                "the size of axis 1 of expected_reward must be the same as that of action_dist"
            )

        return np.average(expected_reward, weights=action_dist[:, :, 0], axis=1).mean()
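In plain terms, the return line evaluates the ground-truth value V(π_e) = (1/n_rounds) Σ_i Σ_a π_e(a | x_i) q(x_i, a): each row of expected_reward is averaged with the evaluation policy's action-choice probabilities as weights (only the first slot of the len_list dimension, action_dist[:, :, 0], is used), and the per-round values are then averaged over rounds.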


def logistic_reward_function(
-    context: np.ndarray, action_context: np.ndarray, random_state: Optional[int] = None,
+    context: np.ndarray,
+    action_context: np.ndarray,
+    random_state: Optional[int] = None,
) -> np.ndarray:
    """Logistic mean reward function for synthetic bandit datasets.
@@ -289,7 +331,9 @@ def logistic_reward_function(


def linear_reward_function(
-    context: np.ndarray, action_context: np.ndarray, random_state: Optional[int] = None,
+    context: np.ndarray,
+    action_context: np.ndarray,
+    random_state: Optional[int] = None,
) -> np.ndarray:
    """Linear mean reward function for synthetic bandit datasets.
@@ -328,7 +372,9 @@ def linear_reward_function(


def linear_behavior_policy(
-    context: np.ndarray, action_context: np.ndarray, random_state: Optional[int] = None,
+    context: np.ndarray,
+    action_context: np.ndarray,
+    random_state: Optional[int] = None,
) -> np.ndarray:
    """Linear contextual behavior policy for synthetic bandit datasets.
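As a usage sketch (not part of this commit), the new method can be exercised roughly as follows; it assumes SyntheticBanditDataset is importable from obp.dataset as in the tests below, and the constant expected_reward and uniform random evaluation policy are purely illustrative:

import numpy as np

from obp.dataset import SyntheticBanditDataset

n_rounds, n_actions, len_list = 5, 10, 1
dataset = SyntheticBanditDataset(n_actions=n_actions)

# illustrative inputs: constant expected reward and a uniform random evaluation policy
expected_reward = np.full((n_rounds, n_actions), 0.5)
action_dist = np.full((n_rounds, n_actions, len_list), 1.0 / n_actions)

policy_value = dataset.calc_ground_truth_policy_value(
    expected_reward=expected_reward, action_dist=action_dist
)
print(policy_value)  # 0.5: a uniform policy on a constant reward of 0.5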
66 changes: 66 additions & 0 deletions tests/dataset/test_synthetic.py
@@ -90,6 +90,72 @@ def test_synthetic_obtain_batch_bandit_feedback():
)


# expected_reward, action_dist, description
invalid_input_of_calc_policy_value = [
    (
        np.ones((2, 3)),
        np.ones((3, 3, 3)),
        "the size of axis 0 of expected_reward must be the same as that of action_dist",
    ),
    (
        np.ones((2, 3)),
        np.ones((2, 2, 3)),
        "the size of axis 1 of expected_reward must be the same as that of action_dist",
    ),
    ("3", np.ones((2, 2, 3)), "expected_reward must be ndarray"),
    (None, np.ones((2, 2, 3)), "expected_reward must be ndarray"),
    (np.ones((2, 3)), np.ones((2, 3)), "action_dist must be 3-dimensional, but is 2."),
    (np.ones((2, 3)), "3", "action_dist must be ndarray"),
    (np.ones((2, 3)), None, "action_dist must be ndarray"),
]

valid_input_of_calc_policy_value = [
    (
        np.ones((2, 3)),
        np.ones((2, 3, 1)),
        "valid shape",
    ),
]


@pytest.mark.parametrize(
    "expected_reward, action_dist, description",
    invalid_input_of_calc_policy_value,
)
def test_synthetic_calc_policy_value_using_invalid_inputs(
    expected_reward,
    action_dist,
    description,
):
    n_actions = 10
    dataset = SyntheticBanditDataset(n_actions=n_actions)

    with pytest.raises(ValueError, match=f"{description}*"):
        _ = dataset.calc_ground_truth_policy_value(
            expected_reward=expected_reward, action_dist=action_dist
        )


@pytest.mark.parametrize(
    "expected_reward, action_dist, description",
    valid_input_of_calc_policy_value,
)
def test_synthetic_calc_policy_value_using_valid_inputs(
    expected_reward,
    action_dist,
    description,
):
    n_actions = 10
    dataset = SyntheticBanditDataset(n_actions=n_actions)

    policy_value = dataset.calc_ground_truth_policy_value(
        expected_reward=expected_reward, action_dist=action_dist
    )
    assert isinstance(
        policy_value, float
    ), "Invalid response of calc_ground_truth_policy_value"


def test_synthetic_logistic_reward_function():
    # context
    with pytest.raises(ValueError):
