Commit

ENH Add Categorical support for HistGradientBoosting (scikit-learn#18394)

Co-authored-by: Nicolas Hug <[email protected]>
Co-authored-by: Olivier Grisel <[email protected]>
Co-authored-by: Olivier Grisel <[email protected]>
4 people authored Nov 16, 2020
1 parent 04c080a commit b4453f1
Showing 24 changed files with 2,206 additions and 182 deletions.
90 changes: 90 additions & 0 deletions benchmarks/bench_hist_gradient_boosting_adult.py
@@ -0,0 +1,90 @@
import argparse
from time import time

from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.utils import (
    get_equivalent_estimator)


parser = argparse.ArgumentParser()
parser.add_argument('--n-leaf-nodes', type=int, default=31)
parser.add_argument('--n-trees', type=int, default=100)
parser.add_argument('--lightgbm', action="store_true", default=False)
parser.add_argument('--learning-rate', type=float, default=.1)
parser.add_argument('--max-bins', type=int, default=255)
parser.add_argument('--no-predict', action="store_true", default=False)
parser.add_argument('--verbose', action="store_true", default=False)
args = parser.parse_args()

n_leaf_nodes = args.n_leaf_nodes
n_trees = args.n_trees
lr = args.learning_rate
max_bins = args.max_bins
verbose = args.verbose


def fit(est, data_train, target_train, libname, **fit_params):
    print(f"Fitting a {libname} model...")
    tic = time()
    est.fit(data_train, target_train, **fit_params)
    toc = time()
    print(f"fitted in {toc - tic:.3f}s")


def predict(est, data_test, target_test):
    if args.no_predict:
        return
    tic = time()
    predicted_test = est.predict(data_test)
    predicted_proba_test = est.predict_proba(data_test)
    toc = time()
    roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
    acc = accuracy_score(target_test, predicted_test)
    print(f"predicted in {toc - tic:.3f}s, "
          f"ROC AUC: {roc_auc:.4f}, ACC: {acc:.4f}")


data = fetch_openml(data_id=179, as_frame=False) # adult dataset
X, y = data.data, data.target

n_features = X.shape[1]
n_categorical_features = len(data.categories)
n_numerical_features = n_features - n_categorical_features
print(f"Number of features: {n_features}")
print(f"Number of categorical features: {n_categorical_features}")
print(f"Number of numerical features: {n_numerical_features}")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2,
                                                     random_state=0)

# Note: no need to use an OrdinalEncoder because categorical features are
# already clean
is_categorical = [name in data.categories for name in data.feature_names]
est = HistGradientBoostingClassifier(
    loss='binary_crossentropy',
    learning_rate=lr,
    max_iter=n_trees,
    max_bins=max_bins,
    max_leaf_nodes=n_leaf_nodes,
    categorical_features=is_categorical,
    early_stopping=False,
    random_state=0,
    verbose=verbose
)

fit(est, X_train, y_train, 'sklearn')
predict(est, X_test, y_test)

if args.lightgbm:
    est = get_equivalent_estimator(est, lib='lightgbm')
    est.set_params(max_cat_to_onehot=1)  # don't use one-hot encoding
    categorical_features = [f_idx
                            for (f_idx, is_cat) in enumerate(is_categorical)
                            if is_cat]
    fit(est, X_train, y_train, 'lightgbm',
        categorical_feature=categorical_features)
    predict(est, X_test, y_test)
84 changes: 84 additions & 0 deletions benchmarks/bench_hist_gradient_boosting_categorical_only.py
@@ -0,0 +1,84 @@
import argparse
from time import time

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.datasets import make_classification
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.utils import (
    get_equivalent_estimator)


parser = argparse.ArgumentParser()
parser.add_argument('--n-leaf-nodes', type=int, default=31)
parser.add_argument('--n-trees', type=int, default=100)
parser.add_argument('--n-features', type=int, default=20)
parser.add_argument('--n-cats', type=int, default=20)
parser.add_argument('--n-samples', type=int, default=10_000)
parser.add_argument('--lightgbm', action="store_true", default=False)
parser.add_argument('--learning-rate', type=float, default=.1)
parser.add_argument('--max-bins', type=int, default=255)
parser.add_argument('--no-predict', action="store_true", default=False)
parser.add_argument('--verbose', action="store_true", default=False)
args = parser.parse_args()

n_leaf_nodes = args.n_leaf_nodes
n_features = args.n_features
n_categories = args.n_cats
n_samples = args.n_samples
n_trees = args.n_trees
lr = args.learning_rate
max_bins = args.max_bins
verbose = args.verbose


def fit(est, data_train, target_train, libname, **fit_params):
    print(f"Fitting a {libname} model...")
    tic = time()
    est.fit(data_train, target_train, **fit_params)
    toc = time()
    print(f"fitted in {toc - tic:.3f}s")


def predict(est, data_test):
    # We don't report accuracy or ROC because the dataset doesn't really make
    # sense: we treat ordered features as un-ordered categories.
    if args.no_predict:
        return
    tic = time()
    est.predict(data_test)
    toc = time()
    print(f"predicted in {toc - tic:.3f}s")


X, y = make_classification(n_samples=n_samples, n_features=n_features,
                           random_state=0)

X = KBinsDiscretizer(n_bins=n_categories, encode='ordinal').fit_transform(X)

print(f"Number of features: {n_features}")
print(f"Number of samples: {n_samples}")

is_categorical = [True] * n_features
est = HistGradientBoostingClassifier(
    loss='binary_crossentropy',
    learning_rate=lr,
    max_iter=n_trees,
    max_bins=max_bins,
    max_leaf_nodes=n_leaf_nodes,
    categorical_features=is_categorical,
    early_stopping=False,
    random_state=0,
    verbose=verbose
)

fit(est, X, y, 'sklearn')
predict(est, X)

if args.lightgbm:
    est = get_equivalent_estimator(est, lib='lightgbm')
    est.set_params(max_cat_to_onehot=1)  # don't use one-hot encoding
    categorical_features = list(range(n_features))
    fit(est, X, y, 'lightgbm',
        categorical_feature=categorical_features)
    predict(est, X)
68 changes: 68 additions & 0 deletions doc/modules/ensemble.rst
@@ -1051,6 +1051,68 @@ multiplying the gradients (and the hessians) by the sample weights. Note that
the binning stage (specifically the quantiles computation) does not take the
weights into account.

.. _categorical_support_gbdt:

Categorical Features Support
----------------------------

:class:`HistGradientBoostingClassifier` and
:class:`HistGradientBoostingRegressor` have native support for categorical
features: they can consider splits on non-ordered, categorical data.

For datasets with categorical features, using the native categorical support
is often better than relying on one-hot encoding
(:class:`~sklearn.preprocessing.OneHotEncoder`), because one-hot encoding
requires more tree depth to achieve equivalent splits. It is also usually
better to rely on the native categorical support rather than to treat
categorical features as continuous (ordinal), which happens for ordinal-encoded
categorical data, since categories are nominal quantities where order does not
matter.

To enable categorical support, a boolean mask can be passed to the
`categorical_features` parameter, indicating which feature is categorical. In
the following, the first feature will be treated as categorical and the
second feature as numerical::

    >>> gbdt = HistGradientBoostingClassifier(categorical_features=[True, False])

Equivalently, one can pass a list of integers indicating the indices of the
categorical features::

    >>> gbdt = HistGradientBoostingClassifier(categorical_features=[0])

The cardinality of each categorical feature should be less than the `max_bins`
parameter, and each categorical feature is expected to be encoded in
`[0, max_bins - 1]`. To that end, it might be useful to pre-process the data
with an :class:`~sklearn.preprocessing.OrdinalEncoder` as done in
:ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`.
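
A minimal sketch of such a pre-processing step, assuming a pandas dataframe
`X_df` with both categorical and numerical columns (the names `X_df`, `y` and
the column counts below are made up for illustration)::

    from sklearn.pipeline import make_pipeline
    from sklearn.compose import make_column_transformer, make_column_selector
    from sklearn.preprocessing import OrdinalEncoder
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    from sklearn.ensemble import HistGradientBoostingRegressor

    # Encode the categorical columns as integers in [0, n_categories - 1].
    # The ColumnTransformer outputs the encoded columns first, followed by
    # the passthrough (numerical) columns.
    ordinal_encoder = make_column_transformer(
        (OrdinalEncoder(), make_column_selector(dtype_include='category')),
        remainder='passthrough')

    n_categorical = 3  # assumed number of categorical columns in X_df
    n_numerical = 5    # assumed number of numerical columns in X_df
    gbdt = make_pipeline(
        ordinal_encoder,
        HistGradientBoostingRegressor(
            categorical_features=(
                [True] * n_categorical + [False] * n_numerical)))
    # gbdt.fit(X_df, y)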

If there are missing values during training, the missing values will be
treated as a proper category. If there are no missing values during training,
then at prediction time, missing values are mapped to the child node that has
the most samples (just like for continuous features). When predicting,
categories that were not seen during fit time will be treated as missing
values.
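
For instance, a small sketch of this behaviour (tiny toy data made up for
illustration; outputs are not shown)::

    import numpy as np
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    from sklearn.ensemble import HistGradientBoostingClassifier

    # One categorical feature encoded as 0, 1, 2, with some missing values.
    # During training, np.nan is treated as an additional category.
    X = np.array([[0.], [1.], [2.], [np.nan], [0.], [1.], [2.], [np.nan]])
    y = [0, 0, 1, 1, 0, 0, 1, 1]

    gbdt = HistGradientBoostingClassifier(
        categorical_features=[True], min_samples_leaf=1).fit(X, y)

    # Category 3 was never seen during fit, so at prediction time it is
    # treated like a missing value.
    gbdt.predict([[3.], [np.nan]])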

**Split finding with categorical features**: The canonical way of considering
categorical splits in a tree is to consider
all of the :math:`2^{K - 1} - 1` partitions, where :math:`K` is the number of
categories. This can quickly become prohibitive when :math:`K` is large.
Fortunately, since gradient boosting trees are always regression trees (even
for classification problems), there exists a faster strategy that can yield
equivalent splits. First, the categories of a feature are sorted according to
the variance of the target, for each category `k`. Once the categories are
sorted, one can consider *continuous partitions*, i.e. treat the categories
as if they were ordered continuous values (see Fisher [Fisher1958]_ for a
formal proof). As a result, only :math:`K - 1` splits need to be considered
instead of :math:`2^{K - 1} - 1`: with :math:`K = 32` categories, for
instance, this means scanning 31 candidate splits instead of roughly two
billion partitions. The initial sorting is a
:math:`\mathcal{O}(K \log(K))` operation, leading to a total complexity of
:math:`\mathcal{O}(K \log(K) + K)`, instead of :math:`\mathcal{O}(2^K)`.
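
The following standalone NumPy sketch illustrates the idea on a squared-error
criterion (an illustration only, not the scikit-learn implementation, which
works on histograms of gradients and hessians; the sorting statistic used
here is the per-category target mean)::

    import numpy as np

    def best_contiguous_split(x_cat, y):
        """Return the set of categories sent to the left child."""
        cats = np.unique(x_cat)
        # Sort the categories by a per-category target statistic.
        order = np.argsort([y[x_cat == c].mean() for c in cats])
        sorted_cats = cats[order]

        def sse(v):  # sum of squared errors around the mean
            return ((v - v.mean()) ** 2).sum()

        best_score, best_subset = np.inf, None
        # Only K - 1 contiguous partitions of the sorted categories are
        # scanned, instead of all 2**(K - 1) - 1 subsets.
        for i in range(1, len(sorted_cats)):
            left = np.isin(x_cat, sorted_cats[:i])
            score = sse(y[left]) + sse(y[~left])
            if score < best_score:
                best_score, best_subset = score, set(sorted_cats[:i])
        return best_subset

    rng = np.random.default_rng(0)
    x_cat = rng.integers(0, 5, size=200)  # 5 categories encoded as 0, ..., 4
    y = (x_cat % 2) + rng.normal(scale=.1, size=200)
    print(best_contiguous_split(x_cat, y))  # expected: the even categories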

.. topic:: Examples:

  * :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`

.. _monotonic_cst_gbdt:

Monotonic Constraints
@@ -1092,6 +1154,10 @@ that the feature is supposed to have a positive / negative effect on the
probability to belong to the positive class. Monotonic constraints are not
supported for multiclass classification.

.. note::
    Since categories are unordered quantities, it is not possible to enforce
    monotonic constraints on categorical features.

.. topic:: Examples:

  * :ref:`sphx_glr_auto_examples_ensemble_plot_monotonic_constraints.py`
@@ -1158,6 +1224,8 @@ Finally, many parts of the implementation of
.. [LightGBM] Ke et al. `"LightGBM: A Highly Efficient Gradient
    Boosting Decision Tree" <https://papers.nips.cc/paper/
    6907-lightgbm-a-highly-efficient-gradient-boosting-decision-tree>`_
.. [Fisher1958] Walter D. Fisher. `"On Grouping for Maximum Homogeneity"
    <http://www.csiss.org/SPACE/workshops/2004/SAC/files/fisher.pdf>`_
.. _voting_classifier:

5 changes: 5 additions & 0 deletions doc/whats_new/v0.24.rst
Original file line number Diff line number Diff line change
@@ -242,6 +242,11 @@ Changelog
:mod:`sklearn.ensemble`
.......................

- |MajorFeature| :class:`ensemble.HistGradientBoostingRegressor` and
  :class:`ensemble.HistGradientBoostingClassifier` now have native
  support for categorical features with the `categorical_features`
  parameter. :pr:`18394` by `Nicolas Hug`_ and `Thomas Fan`_.

- |Feature| :class:`ensemble.HistGradientBoostingRegressor` and
  :class:`ensemble.HistGradientBoostingClassifier` now support the
  method `staged_predict`, which allows monitoring of each stage.