Skip to content

Commit

Permalink
remove group names
Browse files Browse the repository at this point in the history
  • Loading branch information
tvdboom committed Jul 4, 2023
1 parent 969cc80 commit fb9bb87
Show file tree
Hide file tree
Showing 10 changed files with 29 additions and 97 deletions.
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/bug_report.yml
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ body:
----> 7 s = setup(data, target = 'Wrong Class variable')
1 frames
/usr/local/lib/python3.7/dist-packages/pycaret/internal/tabular.py in setup(data, target, ml_usecase, available_plots, train_size, test_data, preprocess, imputation_type, iterative_imputation_iters, categorical_features, categorical_imputation, categorical_iterative_imputer, ordinal_features, high_cardinality_features, high_cardinality_method, numeric_features, numeric_imputation, numeric_iterative_imputer, date_features, ignore_features, normalize, normalize_method, transformation, transformation_method, handle_unknown_categorical, unknown_categorical_method, pca, pca_method, pca_components, ignore_low_variance, combine_rare_levels, rare_level_threshold, bin_numeric_features, remove_outliers, outliers_threshold, remove_multicollinearity, multicollinearity_threshold, remove_perfect_collinearity, create_clusters, cluster_iter, polynomial_features, polynomial_degree, trigonometry_features, polynomial_threshold, group_features, group_names, feature_selection, feature_selection_threshold, feature_selection_method, feature_interaction, feature_ratio, interaction_threshold, fix_imbalance, fix_imbalance_method, transform_target, transform_target_method, data_split_shuffle, data_split_stratify, fold_strategy, fold, fold_shuffle, fold_groups, n_jobs, use_gpu, custom_pipeline, html, session_id, log_experiment, experiment_name, experiment_custom_tags, log_plots, log_profile, log_data, silent, verbose, profile, profile_kwargs, display)
/usr/local/lib/python3.7/dist-packages/pycaret/internal/tabular.py in setup(data, target, ml_usecase, available_plots, train_size, test_data, preprocess, imputation_type, iterative_imputation_iters, categorical_features, categorical_imputation, categorical_iterative_imputer, ordinal_features, high_cardinality_features, high_cardinality_method, numeric_features, numeric_imputation, numeric_iterative_imputer, date_features, ignore_features, normalize, normalize_method, transformation, transformation_method, handle_unknown_categorical, unknown_categorical_method, pca, pca_method, pca_components, ignore_low_variance, combine_rare_levels, rare_level_threshold, bin_numeric_features, remove_outliers, outliers_threshold, remove_multicollinearity, multicollinearity_threshold, remove_perfect_collinearity, create_clusters, cluster_iter, polynomial_features, polynomial_degree, trigonometry_features, polynomial_threshold, group_features, feature_selection, feature_selection_threshold, feature_selection_method, feature_interaction, feature_ratio, interaction_threshold, fix_imbalance, fix_imbalance_method, transform_target, transform_target_method, data_split_shuffle, data_split_stratify, fold_strategy, fold, fold_shuffle, fold_groups, n_jobs, use_gpu, custom_pipeline, html, session_id, log_experiment, experiment_name, experiment_custom_tags, log_plots, log_profile, log_data, silent, verbose, profile, profile_kwargs, display)
330 if not _is_unsupervised(ml_usecase) and target not in data.columns:
331 raise ValueError(
--> 332 f"Target parameter: {target} does not exist in the data provided."
Expand Down
14 changes: 3 additions & 11 deletions pycaret/anomaly/functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ def setup(
polynomial_degree: int = 2,
low_variance_threshold: Optional[float] = None,
group_features: Optional[list] = None,
group_names: Optional[Union[str, list]] = None,
drop_groups: bool = False,
remove_multicollinearity: bool = False,
multicollinearity_threshold: float = 0.9,
Expand Down Expand Up @@ -234,18 +233,12 @@ def setup(
this transformation step.
group_features: list, list of lists or None, default = None
group_features: dict or None, default = None
When the dataset contains features with related characteristics,
add new features with the following statistical properties of that
group: min, max, mean, std, median and mode. The parameter takes a
list of feature names or a list of lists of feature names to specify
multiple groups.
group_names: str, list, or None, default = None
Group names to be used when naming the new features. The length
should match with the number of groups specified in ``group_features``.
If None, new features are named using the default form, e.g. group_1,
group_2, etc... Ignored when ``group_features`` is None.
dict with the group name as key and a list of feature names
belonging to that group as value.
drop_groups: bool, default=False
Expand Down Expand Up @@ -471,7 +464,6 @@ def setup(
polynomial_degree=polynomial_degree,
low_variance_threshold=low_variance_threshold,
group_features=group_features,
group_names=group_names,
drop_groups=drop_groups,
remove_multicollinearity=remove_multicollinearity,
multicollinearity_threshold=multicollinearity_threshold,
Expand Down
14 changes: 3 additions & 11 deletions pycaret/classification/functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ def setup(
polynomial_degree: int = 2,
low_variance_threshold: Optional[float] = None,
group_features: Optional[list] = None,
group_names: Optional[Union[str, list]] = None,
drop_groups: bool = False,
remove_multicollinearity: bool = False,
multicollinearity_threshold: float = 0.9,
Expand Down Expand Up @@ -297,18 +296,12 @@ def setup(
this transformation step.
group_features: list, list of lists or None, default = None
group_features: dict or None, default = None
When the dataset contains features with related characteristics,
add new features with the following statistical properties of that
group: min, max, mean, std, median and mode. The parameter takes a
list of feature names or a list of lists of feature names to specify
multiple groups.
group_names: str, list, or None, default = None
Group names to be used when naming the new features. The length
should match with the number of groups specified in ``group_features``.
If None, new features are named using the default form, e.g. group_1,
group_2, etc... Ignored when ``group_features`` is None.
dict with the group name as key and a list of feature names
belonging to that group as value.
drop_groups: bool, default=False
Expand Down Expand Up @@ -630,7 +623,6 @@ def setup(
polynomial_degree=polynomial_degree,
low_variance_threshold=low_variance_threshold,
group_features=group_features,
group_names=group_names,
drop_groups=drop_groups,
remove_multicollinearity=remove_multicollinearity,
multicollinearity_threshold=multicollinearity_threshold,
Expand Down
15 changes: 4 additions & 11 deletions pycaret/classification/oop.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,6 @@ def setup(
polynomial_degree: int = 2,
low_variance_threshold: Optional[float] = None,
group_features: Optional[list] = None,
group_names: Optional[Union[str, list]] = None,
drop_groups: bool = False,
remove_multicollinearity: bool = False,
multicollinearity_threshold: float = 0.9,
Expand Down Expand Up @@ -391,20 +390,14 @@ def setup(
this transformation step.
group_features: list, list of lists or None, default = None
group_features: dict or None, default = None
When the dataset contains features with related characteristics,
add new features with the following statistical properties of that
group: min, max, mean, std, median and mode. The parameter takes a
list of feature names or a list of lists of feature names to specify
multiple groups.
dict with the group name as key and a list of feature names
belonging to that group as value.
group_names: str, list, or None, default = None
Group names to be used when naming the new features. The length
should match with the number of groups specified in ``group_features``.
If None, new features are named using the default form, e.g. group_1,
group_2, etc... Ignored when ``group_features`` is None.
drop_groups: bool, default=False
Whether to drop the original features in the group. Ignored when
``group_features`` is None.
Expand Down Expand Up @@ -841,7 +834,7 @@ def setup(

# Get statistical properties of a group of features
if group_features:
self._group_features(group_features, group_names, drop_groups)
self._group_features(group_features, drop_groups)

# Drop features that are collinear with other features
if remove_multicollinearity:
Expand Down
19 changes: 2 additions & 17 deletions pycaret/internal/preprocess/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -753,27 +753,12 @@ def _low_variance(self, low_variance_threshold):

self.pipeline.steps.append(("low_variance", variance_estimator))

def _group_features(self, group_features, group_names, drop_groups):
def _group_features(self, group_features, drop_groups):
"""Get statistical properties of a group of features."""
self.logger.info("Set up feature grouping.")

# Convert a single group to sequence
if np.array(group_features).ndim == 1:
group_features = [group_features]

if group_names:
if isinstance(group_names, str):
group_names = [group_names]

if len(group_names) != len(group_features):
raise ValueError(
"Invalid value for the group_names parameter. Length "
f"({len(group_names)}) does not match with length of "
f"group_features ({len(group_features)})."
)

grouping_estimator = TransformerWrapper(
transformer=GroupFeatures(group_features, group_names, drop_groups),
transformer=GroupFeatures(group_features, drop_groups),
exclude=self._fxs["Keep"],
)

Expand Down
10 changes: 2 additions & 8 deletions pycaret/internal/preprocess/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,21 +410,15 @@ class GroupFeatures(BaseEstimator, TransformerMixin):
"""

def __init__(self, group_features, group_names=None, drop_groups=False):
def __init__(self, group_features, drop_groups=False):
self.group_features = group_features
self.group_names = group_names
self.drop_groups = drop_groups

def fit(self, X, y=None):
return self

def transform(self, X, y=None):
if not self.group_names:
self.group_names = [
f"group_{i}" for i in range(1, len(self.group_features) + 1)
]

for name, group in zip(self.group_names, self.group_features):
for name, group in self.group_features.items():
# Drop columns that are not in the dataframe (can be excluded)
group = [g for g in group if g in X]

Expand Down
15 changes: 4 additions & 11 deletions pycaret/internal/pycaret_experiment/unsupervised_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,6 @@ def setup(
polynomial_degree: int = 2,
low_variance_threshold: Optional[float] = None,
group_features: Optional[list] = None,
group_names: Optional[Union[str, list]] = None,
drop_groups: bool = False,
remove_multicollinearity: bool = False,
multicollinearity_threshold: float = 0.9,
Expand Down Expand Up @@ -300,20 +299,14 @@ def setup(
this transformation step.
group_features: list, list of lists or None, default = None
group_features: dict or None, default = None
When the dataset contains features with related characteristics,
add new features with the following statistical properties of that
group: min, max, mean, std, median and mode. The parameter takes a
list of feature names or a list of lists of feature names to specify
multiple groups.
dict with the group name as key and a list of feature names
belonging to that group as value.
group_names: str, list, or None, default = None
Group names to be used when naming the new features. The length
should match with the number of groups specified in ``group_features``.
If None, new features are named using the default form, e.g. group_1,
group_2, etc... Ignored when ``group_features`` is None.
drop_groups: bool, default=False
Whether to drop the original features in the group. Ignored when
``group_features`` is None.
Expand Down Expand Up @@ -621,7 +614,7 @@ def setup(

# Get statistical properties of a group of features
if group_features:
self._group_features(group_features, group_names, drop_groups)
self._group_features(group_features, drop_groups)

# Drop features that are collinear with other features
if remove_multicollinearity:
Expand Down
14 changes: 3 additions & 11 deletions pycaret/regression/functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ def setup(
polynomial_degree: int = 2,
low_variance_threshold: Optional[float] = None,
group_features: Optional[list] = None,
group_names: Optional[Union[str, list]] = None,
drop_groups: bool = False,
remove_multicollinearity: bool = False,
multicollinearity_threshold: float = 0.9,
Expand Down Expand Up @@ -297,18 +296,12 @@ def setup(
this transformation step.
group_features: list, list of lists or None, default = None
group_features: dict or None, default = None
When the dataset contains features with related characteristics,
add new features with the following statistical properties of that
group: min, max, mean, std, median and mode. The parameter takes a
list of feature names or a list of lists of feature names to specify
multiple groups.
group_names: str, list, or None, default = None
Group names to be used when naming the new features. The length
should match with the number of groups specified in ``group_features``.
If None, new features are named using the default form, e.g. group_1,
group_2, etc... Ignored when ``group_features`` is None.
dict with the group name as key and a list of feature names
belonging to that group as value.
drop_groups: bool, default=False
Expand Down Expand Up @@ -628,7 +621,6 @@ def setup(
polynomial_degree=polynomial_degree,
low_variance_threshold=low_variance_threshold,
group_features=group_features,
group_names=group_names,
drop_groups=drop_groups,
remove_multicollinearity=remove_multicollinearity,
multicollinearity_threshold=multicollinearity_threshold,
Expand Down
15 changes: 4 additions & 11 deletions pycaret/regression/oop.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,6 @@ def setup(
polynomial_degree: int = 2,
low_variance_threshold: Optional[float] = None,
group_features: Optional[list] = None,
group_names: Optional[Union[str, list]] = None,
drop_groups: bool = False,
remove_multicollinearity: bool = False,
multicollinearity_threshold: float = 0.9,
Expand Down Expand Up @@ -353,20 +352,14 @@ def setup(
this transformation step.
group_features: list, list of lists or None, default = None
group_features: dict or None, default = None
When the dataset contains features with related characteristics,
add new features with the following statistical properties of that
group: min, max, mean, std, median and mode. The parameter takes a
list of feature names or a list of lists of feature names to specify
multiple groups.
dict with the group name as key and a list of feature names
belonging to that group as value.
group_names: str, list, or None, default = None
Group names to be used when naming the new features. The length
should match with the number of groups specified in ``group_features``.
If None, new features are named using the default form, e.g. group_1,
group_2, etc... Ignored when ``group_features`` is None.
drop_groups: bool, default=False
Whether to drop the original features in the group. Ignored when
``group_features`` is None.
Expand Down Expand Up @@ -807,7 +800,7 @@ def setup(

# Get statistical properties of a group of features
if group_features:
self._group_features(group_features, group_names, drop_groups)
self._group_features(group_features, drop_groups)

# Drop features that are collinear with other features
if remove_multicollinearity:
Expand Down
8 changes: 3 additions & 5 deletions tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,20 +373,18 @@ def test_low_variance_threshold():
def test_feature_grouping(drop_groups):
"""Assert that feature groups are replaced for stats."""
data = pycaret.datasets.get_data("juice")
group_features = [list(data.columns[:2]), list(data.columns[3:5])]
pc = pycaret.classification.setup(
data=data,
target="STORE",
group_features=group_features,
group_names=["gr1", "gr2"],
group_features={"gr1": list(data.columns[:2]), "gr2": list(data.columns[3:5])},
drop_groups=drop_groups,
)
X, _ = pc.pipeline.transform(pc.X, pc.y)
assert "mean(gr1)" in X and "median(gr2)" in X
if drop_groups:
assert all(all(column not in X for column in group) for group in group_features)
assert all(all(column not in X for column in group) for group in ("gr1", "gr2"))
else:
assert all(all(column in X for column in group) for group in group_features)
assert all(all(column in X for column in group) for group in ("gr1", "gr2"))


def test_remove_multicollinearity():
Expand Down

0 comments on commit fb9bb87

Please sign in to comment.