Use handle_unknown=ignore in SuperVectorizer (skrub-data#473)
* Use handle_unknown=ignore in SuperVectorizer

Change default `low_card_cat_transformer` in SuperVectorizer to use handle_unknown="ignore"

* Update changelog

* Change drop to None

* Fix bug for new categories for categorical columns

Casting to pandas' `category` dtype converts unseen categories to NaN, so we now update the list of categories before converting.
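
A minimal sketch of the pandas behavior in question (illustrative, not part of the diff):

    import pandas as pd

    # Values cast to an existing CategoricalDtype that are not among its
    # categories silently become NaN:
    dtype = pd.CategoricalDtype(categories=["a", "b"])
    s = pd.Series(["a", "c"]).astype(dtype)  # -> ["a", NaN]

    # Extending the categories before casting preserves the new value:
    dtype = pd.CategoricalDtype(categories=dtype.categories.union(["c"]))
    s = pd.Series(["a", "c"]).astype(dtype)  # -> ["a", "c"]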

* Fix test to prevent n_samples < n_components

* Update dirty_cat/_super_vectorizer.py

Co-authored-by: Jovan Stojanovic <[email protected]>

* Convert all categorical columns to object dtype inside SuperVectorizer

This avoids dealing with the categories attached to the dtype.

* Put back drop="if_binary"

And use handle_unknown="error" for sklearn < 0.24.2.

* Revert "Convert all categorical columns to object dtype inside SuperVectorizer"

This reverts commit 34ed05f.

* Finish merge

* Change name in CHANGES.rst

* Change min version for handle_unknown=ignore to 1.0.0

and change the warning message to be more informative.

* Set warning stacklevel and fix a name
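
(For context: `stacklevel=2` makes the warning point at the caller's line rather than at the library internals. A minimal sketch, with hypothetical names:)

    import warnings

    def fit_transform():
        # stacklevel=2 attributes the warning to whoever called
        # fit_transform(), not to this line inside the library.
        warnings.warn("example warning", UserWarning, stacklevel=2)

    fit_transform()  # the warning is reported at this call site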

* Replace sup_vec with table_vec

---------

Co-authored-by: Jovan Stojanovic <[email protected]>
LeoGrin and jovan-stojanovic authored Feb 16, 2023
1 parent 90ea4db commit 9b15dd2
Showing 5 changed files with 118 additions and 27 deletions.
4 changes: 4 additions & 0 deletions CHANGES.rst
@@ -40,6 +40,10 @@ Minor changes
which can be used to specify where to save and load from datasets.
:pr:`432` by :user:`Lilian Boulard <LilianBoulard>`

* The :class:`TableVectorizer`'s default `OneHotEncoder` for low-cardinality categorical variables now uses
  `handle_unknown="ignore"` instead of `handle_unknown="error"` (for scikit-learn >= 1.0.0).
  Categories seen only at test time are thus encoded as a vector of zeros instead of raising an error. :pr:`473` by :user:`Leo Grinsztajn <LeoGrin>`
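
  For illustration, a minimal sketch of the new behavior (not part of the diff):

      from sklearn.preprocessing import OneHotEncoder

      enc = OneHotEncoder(handle_unknown="ignore").fit([["a"], ["b"]])
      enc.transform([["c"]]).toarray()  # array([[0., 0.]]) -- unseen category -> all zeros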

Bug fixes
---------

35 changes: 32 additions & 3 deletions dirty_cat/_table_vectorizer.py
@@ -120,7 +120,7 @@ class TableVectorizer(ColumnTransformer):
'remainder' for applying `remainder`,
'passthrough' to return the unencoded columns,
or `None` to use the default transformer
(:class:`~sklearn.preprocessing.OneHotEncoder(drop="if_binary")`).
(:class:`~sklearn.preprocessing.OneHotEncoder(handle_unknown="ignore", drop="if_binary")`).
Features classified under this category are imputed based on the
strategy defined with `impute_missing`.
@@ -327,7 +327,27 @@ def _clone_transformers(self):
if isinstance(self.low_card_cat_transformer, sklearn.base.TransformerMixin):
self.low_card_cat_transformer_ = clone(self.low_card_cat_transformer)
elif self.low_card_cat_transformer is None:
self.low_card_cat_transformer_ = OneHotEncoder(drop="if_binary")
if parse_version(sklearn_version) >= parse_version("1.0.0"):
# sklearn is lenient and lets us use both handle_unknown="ignore"
# and drop="if_binary" at the same time
self.low_card_cat_transformer_ = OneHotEncoder(
drop="if_binary", handle_unknown="ignore"
) # TODO maybe change to "infrequent_if_exist" if we bump the sklearn min version to 1.1
else:
# sklearn is not lenient, and does not let us use both handle_unknown="ignore"
# and drop="if_binary" at the same time
# so we use handle_unknown="error" instead
self.low_card_cat_transformer_ = OneHotEncoder(
drop="if_binary", handle_unknown="error"
)
warn(
"You are using an old version of scikit-learn. "
"Using handle_unknown='error' in low_card_cat_transformer. "
"Please upgrade to scikit-learn 1.0.0 or higher to "
"use handle_unknown='ignore', or change the drop parameter to"
" None.",
stacklevel=2, # display the warning at the level of the user's code (fit_transform method)
)
elif self.low_card_cat_transformer == "remainder":
self.low_card_cat_transformer_ = self.remainder
else:
@@ -437,7 +457,16 @@ def _apply_cast(self, X: pd.DataFrame) -> pd.DataFrame:
for col in self.imputed_columns_:
X[col] = _replace_missing_in_cat_col(X[col])
for col, dtype in self.types_.items():
X[col] = X[col].astype(dtype)
# if categorical, add the new categories to prevent
# them from being encoded as NaN
if pd.api.types.is_categorical_dtype(dtype):
known_categories = dtype.categories
new_categories = pd.unique(X[col])
dtype = pd.CategoricalDtype(
categories=known_categories.union(new_categories)
)
self.types_[col] = dtype
X.loc[:, col] = X[col].astype(dtype)
return X

def fit_transform(self, X, y=None):
72 changes: 65 additions & 7 deletions dirty_cat/tests/test_table_vectorizer.py
@@ -371,23 +371,23 @@ def test_fit() -> None:
# Simply checks that sklearn's `check_is_fitted` function raises an error if
# the TableVectorizer is instantiated but not fitted.
# See GH#193
sup_vec = TableVectorizer()
table_vec = TableVectorizer()
with pytest.raises(NotFittedError):
assert check_is_fitted(sup_vec)
assert check_is_fitted(table_vec)


def test_transform() -> None:
X = _get_clean_dataframe()
sup_vec = TableVectorizer()
sup_vec.fit(X)
table_vec = TableVectorizer()
table_vec.fit(X)
s = [34, 5.5, "private", "manager", "yes", "60K+"]
x = np.array(s).reshape(1, -1)
x_trans = sup_vec.transform(x)
x_trans = table_vec.transform(x)
assert x_trans.tolist() == [
[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 34.0, 5.5]
]
# To understand the list above:
# print(dict(zip(sup_vec.get_feature_names_out(), x_trans.tolist()[0])))
# print(dict(zip(table_vec.get_feature_names_out(), x_trans.tolist()[0])))


def test_fit_transform_equiv() -> None:
@@ -437,7 +437,8 @@ def test_passthrough():
X_enc_clean = pd.DataFrame(
tv.fit_transform(X_clean), columns=tv.get_feature_names_out()
)
# Reorder encoded arrays' columns (see TableVectorizer's doc "Notes" section as to why)
# Reorder encoded arrays' columns
# (see TableVectorizer's doc "Notes" section as to why)
X_enc_dirty = X_enc_dirty[X_dirty.columns]
X_enc_clean = X_enc_clean[X_clean.columns]

@@ -463,3 +464,60 @@ def test_check_name_change():
"""Test that using SuperVectorizer raises a deprecation warning"""
with pytest.warns(FutureWarning):
SuperVectorizer()


def test_handle_unknown():
"""
Test that new categories encountered in the test set
are handled correctly.
"""
X = _get_clean_dataframe()
# Test with low cardinality and a StandardScaler for the numeric columns
table_vec = TableVectorizer(
cardinality_threshold=6, # treat all columns as low cardinality
)
table_vec.fit(X)
x_unknown = pd.DataFrame(
{
"int": pd.Series([3, 1], dtype="int"),
"float": pd.Series([2.1, 4.3], dtype="float"),
"str1": pd.Series(["semi-private", "public"], dtype="string"),
"str2": pd.Series(["researcher", "chef"], dtype="string"),
"cat1": pd.Series(["maybe", "yes"], dtype="category"),
"cat2": pd.Series(["70K+", "20K+"], dtype="category"),
}
)
x_known = pd.DataFrame(
{
"int": pd.Series([1, 4], dtype="int"),
"float": pd.Series([4.3, 3.3], dtype="float"),
"str1": pd.Series(["public", "private"], dtype="string"),
"str2": pd.Series(["chef", "chef"], dtype="string"),
"cat1": pd.Series(["yes", "no"], dtype="category"),
"cat2": pd.Series(["30K+", "20K+"], dtype="category"),
}
)
if parse_version(sklearn.__version__) >= parse_version("1.0.0"):
# Default behavior is "handle_unknown='ignore'",
# so unknown categories are encoded as all zeros
x_trans_unknown = table_vec.transform(x_unknown)
x_trans_known = table_vec.transform(x_known)

assert x_trans_unknown.shape == x_trans_known.shape
n_zeroes = (
X["str2"].nunique() + X["cat2"].nunique() + 2
) # 2 for the binary columns, which each
# get one category dropped
assert np.allclose(
x_trans_unknown[0, :n_zeroes], np.zeros_like(x_trans_unknown[0, :n_zeroes])
)
assert x_trans_unknown[0, n_zeroes] != 0
assert not np.allclose(
x_trans_known[0, :n_zeroes], np.zeros_like(x_trans_known[0, :n_zeroes])
)
else:
# Default behavior is "handle_unknown='error'",
# so unknown categories raise an error
with pytest.raises(ValueError, match="Found unknown categories"):
table_vec.transform(x_unknown)
table_vec.transform(x_known)
10 changes: 5 additions & 5 deletions examples/01_dirty_categories.py
@@ -292,7 +292,7 @@
#
# Let us perform the same workflow, but without the |Pipeline|, so we can
# analyze the TableVectorizer's mechanisms along the way.
sup_vec = TableVectorizer(auto_cast=True)
table_vec = TableVectorizer(auto_cast=True)

# %%
# We split the data between train and test, and transform them:
@@ -302,8 +302,8 @@
X, y, test_size=0.15, random_state=42
)

X_train_enc = sup_vec.fit_transform(X_train, y_train)
X_test_enc = sup_vec.transform(X_test)
X_train_enc = table_vec.fit_transform(X_train, y_train)
X_test_enc = table_vec.transform(X_test)

###############################################################################
# The encoded data, X_train_enc and X_test_enc, are numerical arrays:
@@ -321,7 +321,7 @@
# choice:
from pprint import pprint

pprint(sup_vec.transformers_)
pprint(table_vec.transformers_)

###############################################################################
# This is what is being passed to the |ColumnTransformer| under the hood.
@@ -341,7 +341,7 @@

###############################################################################
# After encoding (we only plot the first 8 feature names):
feature_names = sup_vec.get_feature_names_out()
feature_names = table_vec.get_feature_names_out()
feature_names[:8]

###############################################################################
24 changes: 12 additions & 12 deletions examples/03_datetime_encoder.py
@@ -86,22 +86,22 @@
from dirty_cat import TableVectorizer
from pprint import pprint

sup_vec = TableVectorizer()
sup_vec.fit_transform(X)
pprint(sup_vec.get_feature_names_out())
table_vec = TableVectorizer()
table_vec.fit_transform(X)
pprint(table_vec.get_feature_names_out())

###############################################################################
# If we want the day of the week, we can just replace |TV|'s default parameter:
sup_vec = TableVectorizer(
table_vec = TableVectorizer(
datetime_transformer=DatetimeEncoder(add_day_of_the_week=True),
)
sup_vec.fit_transform(X)
sup_vec.get_feature_names_out()
table_vec.fit_transform(X)
table_vec.get_feature_names_out()

###############################################################################
# We can see that the |TV| is indeed using
# a |DtE| for the datetime features.
pprint(sup_vec.transformers_)
pprint(table_vec.transformers_)

###############################################################################
# Predictions with date features
@@ -113,11 +113,11 @@
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import make_pipeline

sup_vec = TableVectorizer(
table_vec = TableVectorizer(
datetime_transformer=DatetimeEncoder(add_day_of_the_week=True),
)
reg = HistGradientBoostingRegressor()
pipeline = make_pipeline(sup_vec, reg)
pipeline = make_pipeline(table_vec, reg)

###############################################################################
# Evaluating the model
@@ -226,13 +226,13 @@
###############################################################################
from sklearn.inspection import permutation_importance

sup_vec = TableVectorizer(
table_vec = TableVectorizer(
datetime_transformer=DatetimeEncoder(add_day_of_the_week=True),
)

# In this case, we don't use a pipeline, because we want to compute the
# importance of the features created by the DatetimeEncoder
X_ = sup_vec.fit_transform(X)
X_ = table_vec.fit_transform(X)
reg = HistGradientBoostingRegressor().fit(X_, y)
result = permutation_importance(reg, X_, y, n_repeats=10, random_state=0)
std = result.importances_std
@@ -244,7 +244,7 @@
plt.figure(figsize=(12, 9))
plt.title("Feature importances")
n = len(indices)
labels = np.array(sup_vec.get_feature_names_out())[indices]
labels = np.array(table_vec.get_feature_names_out())[indices]
plt.barh(range(n), importances[indices], color="b", yerr=std[indices])
plt.yticks(range(n), labels, size=15)
plt.tight_layout(pad=1)
