Skip to content

Commit

Permalink
alternate filling for missing values in build_anti_testset (NicolasHu…
Browse files Browse the repository at this point in the history
…g#100)

* alternate filling for missing values

* dataset.py

* better doc and a test

* renamed test to match tested method name

* minor text fix
  • Loading branch information
lgalke authored and NicolasHug committed Oct 20, 2017
1 parent 9d2723b commit 9cb442e
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 5 deletions.
19 changes: 14 additions & 5 deletions surprise/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -670,26 +670,35 @@ def build_testset(self):
return [(self.to_raw_uid(u), self.to_raw_iid(i), r)
for (u, i, r) in self.all_ratings()]

def build_anti_testset(self, fill=None):
    """Return a list of ratings that can be used as a testset in the
    :meth:`test() <surprise.prediction_algorithms.algo_base.AlgoBase.test>`
    method.

    The ratings are all the ratings that are **not** in the trainset, i.e.
    all the ratings :math:`r_{ui}` where the user :math:`u` is known, the
    item :math:`i` is known, but the rating :math:`r_{ui}` is not in the
    trainset. As :math:`r_{ui}` is unknown, it is either replaced by the
    :code:`fill` value or assumed to be equal to the mean of all ratings
    :meth:`global_mean <surprise.dataset.Trainset.global_mean>`.

    Args:
        fill(float): The value to fill unknown ratings. If :code:`None` the
            global mean of all ratings :meth:`global_mean
            <surprise.dataset.Trainset.global_mean>` will be used.

    Returns:
        A list of tuples ``(uid, iid, fill)`` where ids are raw ids.
    """
    fill = self.global_mean if fill is None else float(fill)

    anti_testset = []
    for u in self.all_users():
        # Build the set of items rated by this user once per user (not once
        # per (user, item) pair) so each membership test below is O(1).
        user_items = {j for (j, _) in self.ur[u]}
        # The raw user id does not depend on the item; compute it once.
        raw_uid = self.to_raw_uid(u)
        anti_testset.extend(
            (raw_uid, self.to_raw_iid(i), fill)
            for i in self.all_items() if i not in user_items)
    return anti_testset

Expand Down
23 changes: 23 additions & 0 deletions tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,3 +197,26 @@ def test_load_form_df():
trainset = data.build_full_trainset()
with pytest.raises(ValueError):
trainset.to_inner_uid('10000')


def test_build_anti_testset():
    """Check that ``build_anti_testset`` honours the ``fill`` argument.

    Every rating of the anti-testset must equal the requested fill value, or
    the trainset's global mean when ``fill`` is ``None``. The anti-testset
    and the trainset together must cover every (user, item) pair once.
    """
    frame = pd.DataFrame({'itemID': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                          'userID': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                          'rating': [1, 2, 3, 4, 5, 6, 7, 8, 9]})

    data = Dataset.load_from_df(frame[['userID', 'itemID', 'rating']],
                                Reader(rating_scale=(1, 5)))
    data.split(2)
    # Only the trainset of the first fold is needed.
    trainset, _ = next(data.folds())

    # Explicit fill values: every anti-testset rating must equal the value.
    for fill_value in (0, 42., -1):
        anti = trainset.build_anti_testset(fill=fill_value)
        assert all(rating == fill_value for (_, _, rating) in anti)

    # Default behaviour: unknown ratings are filled with the global mean.
    anti = trainset.build_anti_testset(fill=None)
    assert all(rating == trainset.global_mean for (_, _, rating) in anti)

    # Known ratings plus unknown ratings cover all (user, item) pairs.
    n_pairs = trainset.n_users * trainset.n_items
    assert trainset.n_ratings + len(anti) == n_pairs

0 comments on commit 9cb442e

Please sign in to comment.