Use handle_unknown=ignore in SuperVectorizer (skrub-data#473)
* Use handle_unknown=ignore in SuperVectorizer

Change default `low_card_cat_transformer` in SuperVectorizer to use handle_unknown="ignore"

* Update changelog

* Change drop to None

* Fix bug for new categories for categorical columns

Casting to pandas' `category` dtype converts unseen categories to NaN, so we now update the list of categories before converting.
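
A minimal sketch of the pandas behavior in question (illustrative, not part of the diff):

    import pandas as pd

    # Values cast to an existing CategoricalDtype that are not among its
    # categories silently become NaN:
    dtype = pd.CategoricalDtype(categories=["a", "b"])
    s = pd.Series(["a", "c"]).astype(dtype)  # -> ["a", NaN]

    # Extending the categories before casting preserves the new value:
    dtype = pd.CategoricalDtype(categories=dtype.categories.union(["c"]))
    s = pd.Series(["a", "c"]).astype(dtype)  # -> ["a", "c"]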

* Fix test to prevent n_samples < n_components

* Update dirty_cat/_super_vectorizer.py

Co-authored-by: Jovan Stojanovic <[email protected]>

* Convert all categorical columns to object dtype inside SuperVectorizer

This avoids dealing with the categories attached to the dtype.

* Put back drop="if_binary"

And use handle_unknown="error" for sklearn < 0.24.2.

* Revert "Convert all categorical columns to object dtype inside SuperVectorizer"

This reverts commit 34ed05f.

* Finish merge

* Change name in CHANGES.rst

* Change min version for handle_unknown=ignore to 1.0.0

and change the warning message to be more informative.

* Set warning stacklevel and fix a name
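
(For context: `stacklevel=2` makes the warning point at the caller's line rather than at the library internals. A minimal sketch, with hypothetical names:)

    import warnings

    def fit_transform():
        # stacklevel=2 attributes the warning to whoever called
        # fit_transform(), not to this line inside the library.
        warnings.warn("example warning", UserWarning, stacklevel=2)

    fit_transform()  # the warning is reported at this call site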

* Replace sup_vec with table_vec

---------

Co-authored-by: Jovan Stojanovic <[email protected]>
LeoGrin and jovan-stojanovic authored Feb 16, 2023
1 parent 90ea4db commit 9b15dd2
Showing 5 changed files with 118 additions and 27 deletions.
4 changes: 4 additions & 0 deletions CHANGES.rst
@@ -40,6 +40,10 @@ Minor changes
which can be used to specify where to save and load from datasets.
:pr:`432` by :user:`Lilian Boulard <LilianBoulard>`

* The :class:`TableVectorizer`'s default `OneHotEncoder` for low-cardinality categorical variables now uses
  `handle_unknown="ignore"` instead of `handle_unknown="error"` (for scikit-learn >= 1.0.0).
  Categories seen only at test time are thus encoded as a vector of zeros instead of raising an error. :pr:`473` by :user:`Leo Grinsztajn <LeoGrin>`
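
  For illustration, a minimal sketch of the new behavior (not part of the diff):

      from sklearn.preprocessing import OneHotEncoder

      enc = OneHotEncoder(handle_unknown="ignore").fit([["a"], ["b"]])
      enc.transform([["c"]]).toarray()  # array([[0., 0.]]) -- unseen category -> all zeros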

Bug fixes
---------

35 changes: 32 additions & 3 deletions dirty_cat/_table_vectorizer.py
@@ -120,7 +120,7 @@ class TableVectorizer(ColumnTransformer):
'remainder' for applying `remainder`,
'passthrough' to return the unencoded columns,
or `None` to use the default transformer
(:class:`~sklearn.preprocessing.OneHotEncoder(drop="if_binary")`).
(:class:`~sklearn.preprocessing.OneHotEncoder(handle_unknown="ignore", drop="if_binary")`).
Features classified under this category are imputed based on the
strategy defined with `impute_missing`.
@@ -327,7 +327,27 @@ def _clone_transformers(self):
if isinstance(self.low_card_cat_transformer, sklearn.base.TransformerMixin):
self.low_card_cat_transformer_ = clone(self.low_card_cat_transformer)
elif self.low_card_cat_transformer is None:
self.low_card_cat_transformer_ = OneHotEncoder(drop="if_binary")
if parse_version(sklearn_version) >= parse_version("1.0.0"):
# sklearn is lenient and lets us use both handle_unknown="ignore"
# and drop="if_binary" at the same time
self.low_card_cat_transformer_ = OneHotEncoder(
drop="if_binary", handle_unknown="ignore"
) # TODO maybe change to "infrequent_if_exist" if we bump the sklearn min version to 1.1
else:
# sklearn is not lenient, and does not let us use both handle_unknown="ignore"
# and drop="if_binary" at the same time
# so we use handle_unknown="error" instead
self.low_card_cat_transformer_ = OneHotEncoder(
drop="if_binary", handle_unknown="error"
)
warn(
"You are using an old version of scikit-learn. "
"Using handle_unknown='error' in low_card_cat_transformer. "
"Please upgrade to scikit-learn 1.0.0 or higher to "
"use handle_unknown='ignore', or change the drop parameter to"
" None.",
stacklevel=2, # display the warning at the level of the user's code (fit_transform method)
)
elif self.low_card_cat_transformer == "remainder":
self.low_card_cat_transformer_ = self.remainder
else:
@@ -437,7 +457,16 @@ def _apply_cast(self, X: pd.DataFrame) -> pd.DataFrame:
for col in self.imputed_columns_:
X[col] = _replace_missing_in_cat_col(X[col])
for col, dtype in self.types_.items():
X[col] = X[col].astype(dtype)
# if categorical, add the new categories to prevent
# them from being encoded as NaN
if pd.api.types.is_categorical_dtype(dtype):
known_categories = dtype.categories
new_categories = pd.unique(X[col])
dtype = pd.CategoricalDtype(
categories=known_categories.union(new_categories)
)
self.types_[col] = dtype
X.loc[:, col] = X[col].astype(dtype)
return X

def fit_transform(self, X, y=None):
72 changes: 65 additions & 7 deletions dirty_cat/tests/test_table_vectorizer.py
@@ -371,23 +371,23 @@ def test_fit() -> None:
# Simply checks that sklearn's `check_is_fitted` function raises an error if
# the TableVectorizer is instantiated but not fitted.
# See GH#193
sup_vec = TableVectorizer()
table_vec = TableVectorizer()
with pytest.raises(NotFittedError):
assert check_is_fitted(sup_vec)
assert check_is_fitted(table_vec)


def test_transform() -> None:
X = _get_clean_dataframe()
sup_vec = TableVectorizer()
sup_vec.fit(X)
table_vec = TableVectorizer()
table_vec.fit(X)
s = [34, 5.5, "private", "manager", "yes", "60K+"]
x = np.array(s).reshape(1, -1)
x_trans = sup_vec.transform(x)
x_trans = table_vec.transform(x)
assert x_trans.tolist() == [
[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 34.0, 5.5]
]
# To understand the list above:
# print(dict(zip(sup_vec.get_feature_names_out(), x_trans.tolist()[0])))
# print(dict(zip(table_vec.get_feature_names_out(), x_trans.tolist()[0])))


def test_fit_transform_equiv() -> None:
@@ -437,7 +437,8 @@ def test_passthrough():
X_enc_clean = pd.DataFrame(
tv.fit_transform(X_clean), columns=tv.get_feature_names_out()
)
# Reorder encoded arrays' columns (see TableVectorizer's doc "Notes" section as to why)
# Reorder encoded arrays' columns
# (see TableVectorizer's doc "Notes" section as to why)
X_enc_dirty = X_enc_dirty[X_dirty.columns]
X_enc_clean = X_enc_clean[X_clean.columns]

@@ -463,3 +464,60 @@ def test_check_name_change():
"""Test that using SuperVectorizer raises a deprecation warning"""
with pytest.warns(FutureWarning):
SuperVectorizer()


def test_handle_unknown():
"""
Test that new categories encountered in the test set
are handled correctly.
"""
X = _get_clean_dataframe()
# Test with low cardinality and a StandardScaler for the numeric columns
table_vec = TableVectorizer(
cardinality_threshold=6, # treat all columns as low cardinality
)
table_vec.fit(X)
x_unknown = pd.DataFrame(
{
"int": pd.Series([3, 1], dtype="int"),
"float": pd.Series([2.1, 4.3], dtype="float"),
"str1": pd.Series(["semi-private", "public"], dtype="string"),
"str2": pd.Series(["researcher", "chef"], dtype="string"),
"cat1": pd.Series(["maybe", "yes"], dtype="category"),
"cat2": pd.Series(["70K+", "20K+"], dtype="category"),
}
)
x_known = pd.DataFrame(
{
"int": pd.Series([1, 4], dtype="int"),
"float": pd.Series([4.3, 3.3], dtype="float"),
"str1": pd.Series(["public", "private"], dtype="string"),
"str2": pd.Series(["chef", "chef"], dtype="string"),
"cat1": pd.Series(["yes", "no"], dtype="category"),
"cat2": pd.Series(["30K+", "20K+"], dtype="category"),
}
)
if parse_version(sklearn.__version__) >= parse_version("1.0.0"):
# Default behavior is "handle_unknown='ignore'",
# so unknown categories are encoded as all zeros
x_trans_unknown = table_vec.transform(x_unknown)
x_trans_known = table_vec.transform(x_known)

assert x_trans_unknown.shape == x_trans_known.shape
n_zeroes = (
X["str2"].nunique() + X["cat2"].nunique() + 2
) # 2 for the binary columns, which each
# get one category dropped
assert np.allclose(
x_trans_unknown[0, :n_zeroes], np.zeros_like(x_trans_unknown[0, :n_zeroes])
)
assert x_trans_unknown[0, n_zeroes] != 0
assert not np.allclose(
x_trans_known[0, :n_zeroes], np.zeros_like(x_trans_known[0, :n_zeroes])
)
else:
# Default behavior is "handle_unknown='error'",
# so unknown categories raise an error
with pytest.raises(ValueError, match="Found unknown categories"):
table_vec.transform(x_unknown)
table_vec.transform(x_known)
10 changes: 5 additions & 5 deletions examples/01_dirty_categories.py
@@ -292,7 +292,7 @@
#
# Let us perform the same workflow, but without the |Pipeline|, so we can
# analyze the TableVectorizer's mechanisms along the way.
sup_vec = TableVectorizer(auto_cast=True)
table_vec = TableVectorizer(auto_cast=True)

# %%
# We split the data between train and test, and transform them:
@@ -302,8 +302,8 @@
X, y, test_size=0.15, random_state=42
)

X_train_enc = sup_vec.fit_transform(X_train, y_train)
X_test_enc = sup_vec.transform(X_test)
X_train_enc = table_vec.fit_transform(X_train, y_train)
X_test_enc = table_vec.transform(X_test)

###############################################################################
# The encoded data, X_train_enc and X_test_enc, are numerical arrays:
@@ -321,7 +321,7 @@
# choice:
from pprint import pprint

pprint(sup_vec.transformers_)
pprint(table_vec.transformers_)

###############################################################################
# This is what is being passed to the |ColumnTransformer| under the hood.
@@ -341,7 +341,7 @@

###############################################################################
# After encoding (we only plot the first 8 feature names):
feature_names = sup_vec.get_feature_names_out()
feature_names = table_vec.get_feature_names_out()
feature_names[:8]

###############################################################################
24 changes: 12 additions & 12 deletions examples/03_datetime_encoder.py
@@ -86,22 +86,22 @@
from dirty_cat import TableVectorizer
from pprint import pprint

sup_vec = TableVectorizer()
sup_vec.fit_transform(X)
pprint(sup_vec.get_feature_names_out())
table_vec = TableVectorizer()
table_vec.fit_transform(X)
pprint(table_vec.get_feature_names_out())

###############################################################################
# If we want the day of the week, we can just replace |TV|'s default parameter:
sup_vec = TableVectorizer(
table_vec = TableVectorizer(
datetime_transformer=DatetimeEncoder(add_day_of_the_week=True),
)
sup_vec.fit_transform(X)
sup_vec.get_feature_names_out()
table_vec.fit_transform(X)
table_vec.get_feature_names_out()

###############################################################################
# We can see that the |TV| is indeed using
# a |DtE| for the datetime features.
pprint(sup_vec.transformers_)
pprint(table_vec.transformers_)

###############################################################################
# Predictions with date features
@@ -113,11 +113,11 @@
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import make_pipeline

sup_vec = TableVectorizer(
table_vec = TableVectorizer(
datetime_transformer=DatetimeEncoder(add_day_of_the_week=True),
)
reg = HistGradientBoostingRegressor()
pipeline = make_pipeline(sup_vec, reg)
pipeline = make_pipeline(table_vec, reg)

###############################################################################
# Evaluating the model
@@ -226,13 +226,13 @@
###############################################################################
from sklearn.inspection import permutation_importance

sup_vec = TableVectorizer(
table_vec = TableVectorizer(
datetime_transformer=DatetimeEncoder(add_day_of_the_week=True),
)

# In this case, we don't use a pipeline, because we want to compute the
# importance of the features created by the DatetimeEncoder
X_ = sup_vec.fit_transform(X)
X_ = table_vec.fit_transform(X)
reg = HistGradientBoostingRegressor().fit(X_, y)
result = permutation_importance(reg, X_, y, n_repeats=10, random_state=0)
std = result.importances_std
@@ -244,7 +244,7 @@
plt.figure(figsize=(12, 9))
plt.title("Feature importances")
n = len(indices)
labels = np.array(sup_vec.get_feature_names_out())[indices]
labels = np.array(table_vec.get_feature_names_out())[indices]
plt.barh(range(n), importances[indices], color="b", yerr=std[indices])
plt.yticks(range(n), labels, size=15)
plt.tight_layout(pad=1)
