remove group names

jerryming1995 · Jul 4, 2023 · fb9bb87 · fb9bb87
1 parent 969cc80
commit fb9bb87
Show file tree

Hide file tree

Showing 10 changed files with 29 additions and 97 deletions.
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -92,7 +92,7 @@ body:
         ----> 7 s = setup(data, target = 'Wrong Class variable')
 
         1 frames
-        /usr/local/lib/python3.7/dist-packages/pycaret/internal/tabular.py in setup(data, target, ml_usecase, available_plots, train_size, test_data, preprocess, imputation_type, iterative_imputation_iters, categorical_features, categorical_imputation, categorical_iterative_imputer, ordinal_features, high_cardinality_features, high_cardinality_method, numeric_features, numeric_imputation, numeric_iterative_imputer, date_features, ignore_features, normalize, normalize_method, transformation, transformation_method, handle_unknown_categorical, unknown_categorical_method, pca, pca_method, pca_components, ignore_low_variance, combine_rare_levels, rare_level_threshold, bin_numeric_features, remove_outliers, outliers_threshold, remove_multicollinearity, multicollinearity_threshold, remove_perfect_collinearity, create_clusters, cluster_iter, polynomial_features, polynomial_degree, trigonometry_features, polynomial_threshold, group_features, group_names, feature_selection, feature_selection_threshold, feature_selection_method, feature_interaction, feature_ratio, interaction_threshold, fix_imbalance, fix_imbalance_method, transform_target, transform_target_method, data_split_shuffle, data_split_stratify, fold_strategy, fold, fold_shuffle, fold_groups, n_jobs, use_gpu, custom_pipeline, html, session_id, log_experiment, experiment_name, experiment_custom_tags, log_plots, log_profile, log_data, silent, verbose, profile, profile_kwargs, display)
+        /usr/local/lib/python3.7/dist-packages/pycaret/internal/tabular.py in setup(data, target, ml_usecase, available_plots, train_size, test_data, preprocess, imputation_type, iterative_imputation_iters, categorical_features, categorical_imputation, categorical_iterative_imputer, ordinal_features, high_cardinality_features, high_cardinality_method, numeric_features, numeric_imputation, numeric_iterative_imputer, date_features, ignore_features, normalize, normalize_method, transformation, transformation_method, handle_unknown_categorical, unknown_categorical_method, pca, pca_method, pca_components, ignore_low_variance, combine_rare_levels, rare_level_threshold, bin_numeric_features, remove_outliers, outliers_threshold, remove_multicollinearity, multicollinearity_threshold, remove_perfect_collinearity, create_clusters, cluster_iter, polynomial_features, polynomial_degree, trigonometry_features, polynomial_threshold, group_features, feature_selection, feature_selection_threshold, feature_selection_method, feature_interaction, feature_ratio, interaction_threshold, fix_imbalance, fix_imbalance_method, transform_target, transform_target_method, data_split_shuffle, data_split_stratify, fold_strategy, fold, fold_shuffle, fold_groups, n_jobs, use_gpu, custom_pipeline, html, session_id, log_experiment, experiment_name, experiment_custom_tags, log_plots, log_profile, log_data, silent, verbose, profile, profile_kwargs, display)
             330     if not _is_unsupervised(ml_usecase) and target not in data.columns:
             331         raise ValueError(
         --> 332             f"Target parameter: {target} does not exist in the data provided."

diff --git a/pycaret/anomaly/functional.py b/pycaret/anomaly/functional.py
@@ -45,7 +45,6 @@ def setup(
     polynomial_degree: int = 2,
     low_variance_threshold: Optional[float] = None,
     group_features: Optional[list] = None,
-    group_names: Optional[Union[str, list]] = None,
     drop_groups: bool = False,
     remove_multicollinearity: bool = False,
     multicollinearity_threshold: float = 0.9,
@@ -234,18 +233,12 @@ def setup(
         this transformation step.
 
 
-    group_features: list, list of lists or None, default = None
+    group_features: dict or None, default = None
         When the dataset contains features with related characteristics,
         add new fetaures with the following statistical properties of that
         group: min, max, mean, std, median and mode. The parameter takes a
-        list of feature names or a list of lists of feature names to specify
-        multiple groups.
-
-    group_names: str, list, or None, default = None
-        Group names to be used when naming the new features. The length
-        should match with the number of groups specified in ``group_features``.
-        If None, new features are named using the default form, e.g. group_1,
-        group_2, etc... Ignored when ``group_features`` is None.
+        dict with the group name as key and a list of feature names
+        belonging to that group as value.
 
 
     drop_groups: bool, default=False
@@ -471,7 +464,6 @@ def setup(
         polynomial_degree=polynomial_degree,
         low_variance_threshold=low_variance_threshold,
         group_features=group_features,
-        group_names=group_names,
         drop_groups=drop_groups,
         remove_multicollinearity=remove_multicollinearity,
         multicollinearity_threshold=multicollinearity_threshold,

diff --git a/pycaret/classification/functional.py b/pycaret/classification/functional.py
@@ -52,7 +52,6 @@ def setup(
     polynomial_degree: int = 2,
     low_variance_threshold: Optional[float] = None,
     group_features: Optional[list] = None,
-    group_names: Optional[Union[str, list]] = None,
     drop_groups: bool = False,
     remove_multicollinearity: bool = False,
     multicollinearity_threshold: float = 0.9,
@@ -297,18 +296,12 @@ def setup(
         this transformation step.
 
 
-    group_features: list, list of lists or None, default = None
+    group_features: dict or None, default = None
         When the dataset contains features with related characteristics,
         add new fetaures with the following statistical properties of that
         group: min, max, mean, std, median and mode. The parameter takes a
-        list of feature names or a list of lists of feature names to specify
-        multiple groups.
-
-    group_names: str, list, or None, default = None
-        Group names to be used when naming the new features. The length
-        should match with the number of groups specified in ``group_features``.
-        If None, new features are named using the default form, e.g. group_1,
-        group_2, etc... Ignored when ``group_features`` is None.
+        dict with the group name as key and a list of feature names
+        belonging to that group as value.
 
 
     drop_groups: bool, default=False
@@ -630,7 +623,6 @@ def setup(
         polynomial_degree=polynomial_degree,
         low_variance_threshold=low_variance_threshold,
         group_features=group_features,
-        group_names=group_names,
         drop_groups=drop_groups,
         remove_multicollinearity=remove_multicollinearity,
         multicollinearity_threshold=multicollinearity_threshold,

diff --git a/pycaret/classification/oop.py b/pycaret/classification/oop.py
@@ -143,7 +143,6 @@ def setup(
         polynomial_degree: int = 2,
         low_variance_threshold: Optional[float] = None,
         group_features: Optional[list] = None,
-        group_names: Optional[Union[str, list]] = None,
         drop_groups: bool = False,
         remove_multicollinearity: bool = False,
         multicollinearity_threshold: float = 0.9,
@@ -391,20 +390,14 @@ def setup(
             this transformation step.
 
 
-        group_features: list, list of lists or None, default = None
+        group_features: dict or None, default = None
             When the dataset contains features with related characteristics,
             add new fetaures with the following statistical properties of that
             group: min, max, mean, std, median and mode. The parameter takes a
-            list of feature names or a list of lists of feature names to specify
-            multiple groups.
+            dict with the group name as key and a list of feature names
+            belonging to that group as value.
 
 
-        group_names: str, list, or None, default = None
-            Group names to be used when naming the new features. The length
-            should match with the number of groups specified in ``group_features``.
-            If None, new features are named using the default form, e.g. group_1,
-            group_2, etc... Ignored when ``group_features`` is None.
-
         drop_groups: bool, default=False
             Whether to drop the original features in the group. Ignored when
             ``group_features`` is None.
@@ -841,7 +834,7 @@ def setup(
 
             # Get statistical properties of a group of features
             if group_features:
-                self._group_features(group_features, group_names, drop_groups)
+                self._group_features(group_features, drop_groups)
 
             # Drop features that are collinear with other features
             if remove_multicollinearity:

diff --git a/pycaret/internal/preprocess/preprocessor.py b/pycaret/internal/preprocess/preprocessor.py
@@ -753,27 +753,12 @@ def _low_variance(self, low_variance_threshold):
 
         self.pipeline.steps.append(("low_variance", variance_estimator))
 
-    def _group_features(self, group_features, group_names, drop_groups):
+    def _group_features(self, group_features, drop_groups):
         """Get statistical properties of a group of features."""
         self.logger.info("Set up feature grouping.")
 
-        # Convert a single group to sequence
-        if np.array(group_features).ndim == 1:
-            group_features = [group_features]
-
-        if group_names:
-            if isinstance(group_names, str):
-                group_names = [group_names]
-
-            if len(group_names) != len(group_features):
-                raise ValueError(
-                    "Invalid value for the group_names parameter. Length "
-                    f"({len(group_names)}) does not match with length of "
-                    f"group_features ({len(group_features)})."
-                )
-
         grouping_estimator = TransformerWrapper(
-            transformer=GroupFeatures(group_features, group_names, drop_groups),
+            transformer=GroupFeatures(group_features, drop_groups),
             exclude=self._fxs["Keep"],
         )
 

diff --git a/pycaret/internal/preprocess/transformers.py b/pycaret/internal/preprocess/transformers.py
@@ -410,21 +410,15 @@ class GroupFeatures(BaseEstimator, TransformerMixin):
 
     """
 
-    def __init__(self, group_features, group_names=None, drop_groups=False):
+    def __init__(self, group_features, drop_groups=False):
         self.group_features = group_features
-        self.group_names = group_names
         self.drop_groups = drop_groups
 
     def fit(self, X, y=None):
         return self
 
     def transform(self, X, y=None):
-        if not self.group_names:
-            self.group_names = [
-                f"group_{i}" for i in range(1, len(self.group_features) + 1)
-            ]
-
-        for name, group in zip(self.group_names, self.group_features):
+        for name, group in self.group_features.items():
             # Drop columns that are not in the dataframe (can be excluded)
             group = [g for g in group if g in X]
 

diff --git a/pycaret/internal/pycaret_experiment/unsupervised_experiment.py b/pycaret/internal/pycaret_experiment/unsupervised_experiment.py
@@ -103,7 +103,6 @@ def setup(
         polynomial_degree: int = 2,
         low_variance_threshold: Optional[float] = None,
         group_features: Optional[list] = None,
-        group_names: Optional[Union[str, list]] = None,
         drop_groups: bool = False,
         remove_multicollinearity: bool = False,
         multicollinearity_threshold: float = 0.9,
@@ -300,20 +299,14 @@ def setup(
             this transformation step.
 
 
-        group_features: list, list of lists or None, default = None
+        group_features: dict or None, default = None
             When the dataset contains features with related characteristics,
             add new fetaures with the following statistical properties of that
             group: min, max, mean, std, median and mode. The parameter takes a
-            list of feature names or a list of lists of feature names to specify
-            multiple groups.
+            dict with the group name as key and a list of feature names
+            belonging to that group as value.
 
 
-        group_names: str, list, or None, default = None
-            Group names to be used when naming the new features. The length
-            should match with the number of groups specified in ``group_features``.
-            If None, new features are named using the default form, e.g. group_1,
-            group_2, etc... Ignored when ``group_features`` is None.
-
         drop_groups: bool, default=False
             Whether to drop the original features in the group. Ignored when
             ``group_features`` is None.
@@ -621,7 +614,7 @@ def setup(
 
             # Get statistical properties of a group of features
             if group_features:
-                self._group_features(group_features, group_names, drop_groups)
+                self._group_features(group_features, drop_groups)
 
             # Drop features that are collinear with other features
             if remove_multicollinearity:

diff --git a/pycaret/regression/functional.py b/pycaret/regression/functional.py
@@ -52,7 +52,6 @@ def setup(
     polynomial_degree: int = 2,
     low_variance_threshold: Optional[float] = None,
     group_features: Optional[list] = None,
-    group_names: Optional[Union[str, list]] = None,
     drop_groups: bool = False,
     remove_multicollinearity: bool = False,
     multicollinearity_threshold: float = 0.9,
@@ -297,18 +296,12 @@ def setup(
         this transformation step.
 
 
-    group_features: list, list of lists or None, default = None
+    group_features: dict or None, default = None
         When the dataset contains features with related characteristics,
         add new fetaures with the following statistical properties of that
         group: min, max, mean, std, median and mode. The parameter takes a
-        list of feature names or a list of lists of feature names to specify
-        multiple groups.
-
-    group_names: str, list, or None, default = None
-        Group names to be used when naming the new features. The length
-        should match with the number of groups specified in ``group_features``.
-        If None, new features are named using the default form, e.g. group_1,
-        group_2, etc... Ignored when ``group_features`` is None.
+        dict with the group name as key and a list of feature names
+        belonging to that group as value.
 
 
     drop_groups: bool, default=False
@@ -628,7 +621,6 @@ def setup(
         polynomial_degree=polynomial_degree,
         low_variance_threshold=low_variance_threshold,
         group_features=group_features,
-        group_names=group_names,
         drop_groups=drop_groups,
         remove_multicollinearity=remove_multicollinearity,
         multicollinearity_threshold=multicollinearity_threshold,

diff --git a/pycaret/regression/oop.py b/pycaret/regression/oop.py
@@ -105,7 +105,6 @@ def setup(
         polynomial_degree: int = 2,
         low_variance_threshold: Optional[float] = None,
         group_features: Optional[list] = None,
-        group_names: Optional[Union[str, list]] = None,
         drop_groups: bool = False,
         remove_multicollinearity: bool = False,
         multicollinearity_threshold: float = 0.9,
@@ -353,20 +352,14 @@ def setup(
             this transformation step.
 
 
-        group_features: list, list of lists or None, default = None
+        group_features: dict or None, default = None
             When the dataset contains features with related characteristics,
             add new fetaures with the following statistical properties of that
             group: min, max, mean, std, median and mode. The parameter takes a
-            list of feature names or a list of lists of feature names to specify
-            multiple groups.
+            dict with the group name as key and a list of feature names
+            belonging to that group as value.
 
 
-        group_names: str, list, or None, default = None
-            Group names to be used when naming the new features. The length
-            should match with the number of groups specified in ``group_features``.
-            If None, new features are named using the default form, e.g. group_1,
-            group_2, etc... Ignored when ``group_features`` is None.
-
         drop_groups: bool, default=False
             Whether to drop the original features in the group. Ignored when
             ``group_features`` is None.
@@ -807,7 +800,7 @@ def setup(
 
             # Get statistical properties of a group of features
             if group_features:
-                self._group_features(group_features, group_names, drop_groups)
+                self._group_features(group_features, drop_groups)
 
             # Drop features that are collinear with other features
             if remove_multicollinearity:

diff --git a/tests/test_preprocess.py b/tests/test_preprocess.py
@@ -373,20 +373,18 @@ def test_low_variance_threshold():
 def test_feature_grouping(drop_groups):
     """Assert that feature groups are replaced for stats."""
     data = pycaret.datasets.get_data("juice")
-    group_features = [list(data.columns[:2]), list(data.columns[3:5])]
     pc = pycaret.classification.setup(
         data=data,
         target="STORE",
-        group_features=group_features,
-        group_names=["gr1", "gr2"],
+        group_features={"gr1": list(data.columns[:2]), "gr2": list(data.columns[3:5])},
         drop_groups=drop_groups,
     )
     X, _ = pc.pipeline.transform(pc.X, pc.y)
     assert "mean(gr1)" in X and "median(gr2)" in X
     if drop_groups:
-        assert all(all(column not in X for column in group) for group in group_features)
+        assert all(all(column not in X for column in group) for group in (data.columns[:2], data.columns[3:5]))
     else:
-        assert all(all(column in X for column in group) for group in group_features)
+        assert all(all(column in X for column in group) for group in (data.columns[:2], data.columns[3:5]))
 
 
 def test_remove_multicollinearity():