forked from pycaret/pycaret
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_preprocess.py
449 lines (372 loc) · 14.9 KB
/
test_preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
# coding: utf-8
"""
Package: PyCaret
Author: Mavs
Description: Unit tests for pipeline.py
"""
import numpy as np
import pandas as pd
import pytest
from imblearn.over_sampling import ADASYN
from scipy.sparse import csr_matrix
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pycaret.classification
import pycaret.datasets
import pycaret.regression
def test_select_target_by_index():
    """Check that an integer ``target`` is resolved to a column name."""
    juice = pycaret.datasets.get_data("juice")
    experiment = pycaret.classification.setup(juice, target=2)
    # The test expects column position 2 to resolve to this name.
    expected_target = "WeekofPurchase"
    assert experiment.target_param == expected_target
def test_select_target_by_str():
    """Check that a column name passed as ``target`` is kept as the target."""
    target_name = "WeekofPurchase"
    juice = pycaret.datasets.get_data("juice")
    experiment = pycaret.classification.setup(juice, target=target_name)
    assert experiment.target_param == "WeekofPurchase"
def test_nans_in_target_column():
    """Check that setup raises a ValueError when the target has a missing value."""
    juice = pycaret.datasets.get_data("juice")
    # Inject a single NaN into the column that is used as target below.
    juice.loc[3, "WeekofPurchase"] = np.nan
    expected_message = r".*missing values found.*"
    with pytest.raises(ValueError, match=expected_message):
        pycaret.classification.setup(juice, target="WeekofPurchase")
def test_select_target_by_sequence():
    """Check that a plain sequence of labels is accepted as ``target``."""
    juice = pycaret.datasets.get_data("juice")
    # One label per row of the dataset.
    labels = [1] * len(juice)
    experiment = pycaret.classification.setup(juice, target=labels)
    assert experiment.target_param == "target"
def test_input_is_array():
    """Check that a numpy array is accepted as input data."""
    features = np.eye(4)
    labels = [1, 0, 0, 1]
    experiment = pycaret.classification.setup(features, target=labels)
    # The array must be exposed as a DataFrame with the target named "target".
    assert isinstance(experiment.dataset, pd.DataFrame)
    assert experiment.target_param == "target"
def test_input_is_sparse():
    """Check that a scipy sparse matrix is accepted as input data."""
    sparse_features = csr_matrix((300, 4))
    labels = [1, 0, 1] * 100
    experiment = pycaret.classification.setup(
        data=sparse_features,
        target=labels,
        preprocess=False,
    )
    # The matrix must be exposed as a DataFrame with the target named "target".
    assert isinstance(experiment.dataset, pd.DataFrame)
    assert experiment.target_param == "target"
def test_assign_index_is_false():
    """Check that ``index=False`` discards the original index labels."""
    juice = pycaret.datasets.get_data("juice")
    # Shift the index so that it no longer starts at 0.
    offset = 100
    juice.index = list(range(offset, offset + len(juice)))
    experiment = pycaret.classification.setup(juice, index=False)
    first_label = experiment.dataset.index[0]
    assert first_label == 0
def test_assign_index_is_true():
    """Check that ``index=True`` keeps the original index labels."""
    juice = pycaret.datasets.get_data("juice")
    # Shift the index so that it no longer starts at 0.
    offset = 100
    juice.index = list(range(offset, offset + len(juice)))
    experiment = pycaret.classification.setup(
        data=juice,
        index=True,
        data_split_shuffle=False,
        data_split_stratify=False,
    )
    first_label = experiment.dataset.index[0]
    assert first_label == 100
@pytest.mark.parametrize("index", [1, "WeekofPurchase", list(range(2, 1072))])
def test_assign_index(index):
    """Check that a position, a column name or a sequence can define the index."""
    juice = pycaret.datasets.get_data("juice")
    experiment = pycaret.classification.setup(
        data=juice,
        index=index,
        data_split_shuffle=False,
        data_split_stratify=False,
        preprocess=False,
    )
    # None of the parametrized indices is expected to start at label 0.
    first_label = experiment.dataset.index[0]
    assert first_label != 0
def test_preprocess_is_False():
    """Check that ``preprocess=False`` leaves categorical columns unencoded."""
    juice = pycaret.datasets.get_data("juice")
    experiment = pycaret.classification.setup(juice, preprocess=False)
    transformed, _ = experiment.pipeline.transform(experiment.X, experiment.y)
    # A numeric dtype kind (int, float, unsigned) would mean the column was encoded.
    purchase_kind = transformed["Purchase"].dtype.kind
    assert purchase_kind not in "ifu"
def test_ignore_features():
    """Check that columns listed in ``ignore_features`` are absent after transform."""
    ignored = ["Purchase"]
    juice = pycaret.datasets.get_data("juice")
    experiment = pycaret.classification.setup(juice, ignore_features=ignored)
    transformed, _ = experiment.pipeline.transform(experiment.X, experiment.y)
    assert "Purchase" not in transformed
def test_encode_target():
    """Check that the transformed target column has a numeric dtype."""
    telescope = pycaret.datasets.get_data("telescope")
    experiment = pycaret.classification.setup(telescope)
    _, encoded_target = experiment.pipeline.transform(experiment.X, experiment.y)
    # "i", "f" and "u" are the numpy kinds for int, float and unsigned int.
    target_kind = encoded_target.dtype.kind
    assert target_kind in "ifu"
def test_date_features():
    """Check that day, month and year columns are derived from a date feature."""
    juice = pycaret.datasets.get_data("juice")
    juice["date"] = pd.date_range(start="1/1/2018", periods=len(juice))
    experiment = pycaret.classification.setup(
        juice, target=-2, date_features=["date"]
    )
    transformed, _ = experiment.pipeline.transform(experiment.X, experiment.y)
    # Each extracted attribute is expected as a "date_"-prefixed column.
    for attr in ("day", "month", "year"):
        assert f"date_{attr}" in transformed
def test_custom_date_features():
    """Assert that only the requested attributes are extracted from date features.

    With ``create_date_columns=["quarter"]`` the transformed data must contain
    the ``date_quarter`` column and must not contain the ``date_day`` column.
    """
    data = pycaret.datasets.get_data("juice")
    data["date"] = pd.date_range(start="1/1/2018", periods=len(data))
    pc = pycaret.classification.setup(
        data,
        target=-2,
        date_features=["date"],
        create_date_columns=["quarter"],
    )
    X, _ = pc.pipeline.transform(pc.X, pc.y)
    assert "date_quarter" in X
    # Generated columns carry the "date_" prefix, so the prefixed name has to
    # be checked; a bare "day" column never exists and would always pass.
    assert "date_day" not in X
@pytest.mark.parametrize(
    "imputation_method", [0, "drop", "mean", "median", "mode", "knn"]
)
def test_simple_numeric_imputation(imputation_method):
    """Assert that missing numeric values are imputed with the simple imputer.

    A single missing value is injected into the numeric "WeekofPurchase"
    column and every parametrized strategy must leave no missing values in
    the transformed features.
    """
    data = pycaret.datasets.get_data("juice")
    data.loc[100, "WeekofPurchase"] = np.nan
    pc = pycaret.classification.setup(
        data=data,
        imputation_type="simple",
        # The strategy must go to `numeric_imputation`, mirroring
        # `categorical_imputation` in the categorical test; passing it to
        # `numeric_iterative_imputer` would not exercise simple imputation.
        numeric_imputation=imputation_method,
    )
    X, _ = pc.pipeline.transform(pc.X, pc.y)
    assert X.isna().sum().sum() == 0
@pytest.mark.parametrize("imputation_method", ["drop", "missing", "mode"])
def test_simple_categorical_imputation(imputation_method):
    """Check that a missing categorical value is gone after simple imputation."""
    juice = pycaret.datasets.get_data("juice")
    # Inject one missing value into the "Purchase" column.
    juice.loc[100, "Purchase"] = np.nan
    experiment = pycaret.classification.setup(
        data=juice,
        imputation_type="simple",
        categorical_imputation=imputation_method,
    )
    transformed, _ = experiment.pipeline.transform(experiment.X, experiment.y)
    remaining_missing = transformed.isna().sum().sum()
    assert remaining_missing == 0
def test_iterative_imputer():
    """Assert that the iterative imputer fills all missing values.

    Missing values are injected into every column except "STORE", "PriceCH"
    and "DiscMM". For each estimator, the fitted ``iterative_imputer`` step
    must return data without missing values in which "Purchase" and "Store7"
    contain exactly the categories observed before values were removed. The
    transform is run and checked twice per estimator.
    """
    data = pycaret.datasets.get_data("juice")
    categories = {}
    for i, col in enumerate(data.columns):
        # leave two columns and target filled
        if col in ("STORE", "PriceCH", "DiscMM"):
            continue
        if col in ("Purchase", "Store7"):
            # Remember the original categories before values are removed.
            categories[col] = set(data[col].unique())
        # Use numpy directly: the `pd.np` alias is deprecated and is not
        # available in current pandas releases.
        data.loc[data.sample(frac=0.1, random_state=i).index, col] = np.nan

    for imputer in ["catboost", "lightgbm", "rf", "lr"]:
        data = data.copy()
        pc = pycaret.classification.setup(
            data=data,
            imputation_type="iterative",
            numeric_iterative_imputer=imputer,
            categorical_iterative_imputer=imputer,
        )
        transformer = pc.pipeline.named_steps["iterative_imputer"]

        df = transformer.transform(data, data["STORE"])[0]
        assert not df.isnull().values.any()
        assert all(categories[col] == set(df[col].unique()) for col in categories)

        # Transform a second time with the same fitted step and re-check.
        df = transformer.transform(data, data["STORE"])[0]
        assert not df.isnull().values.any()
        assert all(categories[col] == set(df[col].unique()) for col in categories)
@pytest.mark.parametrize("embedding_method", ["bow", "tf-idf"])
def test_text_embedding(embedding_method):
    """Check that a text column is expanded into embedding columns."""
    spx = pycaret.datasets.get_data("spx")
    # Only the first 50 rows are used to keep the test fast.
    subset = spx.iloc[:50, :]
    experiment = pycaret.regression.setup(
        data=subset,
        text_features=["text"],
        text_features_method=embedding_method,
    )
    transformed, _ = experiment.pipeline.transform(experiment.X, experiment.y)
    # The embedding is expected to produce more than 50 columns.
    n_columns = transformed.shape[1]
    assert n_columns > 50
def test_encoding_ordinal_features():
    """Check that ordinal categories are mapped to integers in the given order."""
    employee = pycaret.datasets.get_data("employee")
    experiment = pycaret.classification.setup(
        data=employee,
        imputation_type=None,
        ordinal_features={"salary": ["low", "medium", "high"]},
    )
    # Run the pipeline once; the transformed output itself is not inspected.
    _features, _ = experiment.pipeline.transform(experiment.X, experiment.y)
    encoder_mapping = experiment.pipeline.steps[0][1].transformer.mapping
    salary_codes = encoder_mapping[0]["mapping"]
    for expected_code, level in enumerate(("low", "medium", "high")):
        assert salary_codes[level] == expected_code
def test_encoding_grouping_rare_categories():
    """Check that a "rare" category appears in the encoder mapping."""
    juice = pycaret.datasets.get_data("juice")
    experiment = pycaret.classification.setup(juice, rare_to_value=0.5)
    # Run the pipeline once; the transformed output itself is not inspected.
    _features, _ = experiment.pipeline.transform(experiment.X, experiment.y)
    encoder_step = experiment.pipeline.steps[-2][1]
    category_codes = encoder_step.transformer.mapping[0]["mapping"]
    assert "rare" in category_codes
def test_encoding_categorical_features():
    """Check that the categorical "Purchase" column is encoded as 0.0 and 1.0."""
    juice = pycaret.datasets.get_data("juice")
    experiment = pycaret.classification.setup(juice)
    transformed, _ = experiment.pipeline.transform(experiment.X, experiment.y)
    encoded_values = sorted(transformed["Purchase"].unique())
    assert encoded_values == [0.0, 1.0]
@pytest.mark.parametrize("transformation_method", ["yeo-johnson", "quantile"])
def test_transformation(transformation_method):
    """Assert that features can be transformed to a gaussian distribution.

    The encoded "Purchase" column must no longer consist of exactly the
    values 0.0 and 1.0 once the transformation has been applied.
    """
    data = pycaret.datasets.get_data("juice")
    pc = pycaret.classification.setup(
        data=data,
        transformation=True,
        transformation_method=transformation_method,
    )
    X, _ = pc.pipeline.transform(pc.X, pc.y)
    # Sort before comparing: `unique()` returns values in order of appearance,
    # so an unsorted [1.0, 0.0] would satisfy `!=` without any transformation.
    assert sorted(X["Purchase"].unique()) != [0.0, 1.0]
@pytest.mark.parametrize("normalize_method", ["zscore", "minmax", "maxabs", "robust"])
def test_normalize(normalize_method):
    """Check that normalization scales the "WeekofPurchase" column down."""
    juice = pycaret.datasets.get_data("juice")
    experiment = pycaret.classification.setup(
        data=juice,
        normalize=True,
        normalize_method=normalize_method,
    )
    transformed, _ = experiment.pipeline.transform(experiment.X, experiment.y)
    # The scaled column is expected to stay below 5 for every method.
    largest_value = transformed["WeekofPurchase"].max()
    assert largest_value < 5
def test_low_variance_threshold():
    """Check that a constant column is removed by the low-variance filter."""
    juice = pycaret.datasets.get_data("juice")
    # A column holding a single value has no variance at all.
    juice["feature"] = 1
    experiment = pycaret.classification.setup(
        data=juice,
        target="STORE",
        low_variance_threshold=0,
    )
    transformed, _ = experiment.pipeline.transform(experiment.X, experiment.y)
    assert "feature" not in transformed
def test_feature_grouping():
    """Check that grouped features are replaced by per-group statistics."""
    juice = pycaret.datasets.get_data("juice")
    first_group = list(juice.columns[:2])
    second_group = list(juice.columns[3:5])
    experiment = pycaret.classification.setup(
        data=juice,
        target="STORE",
        group_features=[first_group, second_group],
        group_names=["gr1", "gr2"],
    )
    transformed, _ = experiment.pipeline.transform(experiment.X, experiment.y)
    # The original column must be gone and the statistic columns present.
    assert "Id" not in transformed
    assert "mean(gr1)" in transformed
    assert "median(gr2)" in transformed
def test_remove_multicollinearity():
    """Check that the second of two collinear features is dropped."""
    juice = pycaret.datasets.get_data("juice")
    # Extra monotonically increasing column, meant to correlate with "Id".
    juice["Id 2"] = list(range(len(juice)))
    experiment = pycaret.classification.setup(
        data=juice,
        target="STORE",
        remove_multicollinearity=True,
        multicollinearity_threshold=0.9999,
    )
    transformed, _ = experiment.pipeline.transform(experiment.X, experiment.y)
    assert "Id" in transformed
    assert "Id 2" not in transformed
def test_bin_numeric_features():
    """Check that a binned numeric feature ends up with five distinct values."""
    juice = pycaret.datasets.get_data("juice")
    experiment = pycaret.classification.setup(
        data=juice, bin_numeric_features=["Id"]
    )
    transformed, _ = experiment.pipeline.transform(experiment.X, experiment.y)
    n_bins = transformed["Id"].nunique()
    assert n_bins == 5
@pytest.mark.parametrize("outliers_method", ["iforest", "ee", "lof"])
def test_remove_outliers(outliers_method):
    """Check that enabling outlier removal adds it as the last pipeline step."""
    juice = pycaret.datasets.get_data("juice")
    experiment = pycaret.classification.setup(
        data=juice,
        low_variance_threshold=None,
        remove_outliers=True,
        outliers_method=outliers_method,
        outliers_threshold=0.2,
    )
    last_step_name, _ = experiment.pipeline.steps[-1]
    assert last_step_name == "remove_outliers"
def test_polynomial_features():
    """Check that polynomial expansion adds columns to the data."""
    juice = pycaret.datasets.get_data("juice")
    experiment = pycaret.classification.setup(
        data=juice,
        polynomial_features=True,
        polynomial_degree=2,
    )
    transformed, _ = experiment.pipeline.transform(experiment.X, experiment.y)
    # More columns than the raw dataset means extra features were created.
    n_transformed_columns = transformed.shape[1]
    n_original_columns = juice.shape[1]
    assert n_transformed_columns > n_original_columns
@pytest.mark.parametrize(
    "fix_imbalance_method", ["smote", "nearmiss", "SMOTEENN", ADASYN()]
)
def test_fix_imbalance(fix_imbalance_method):
    """Check that enabling class balancing adds it as the last pipeline step."""
    juice = pycaret.datasets.get_data("juice")
    experiment = pycaret.classification.setup(
        data=juice,
        low_variance_threshold=None,
        fix_imbalance=True,
        fix_imbalance_method=fix_imbalance_method,
    )
    # The resampling step is expected at the very end of the pipeline.
    last_step_name, _ = experiment.pipeline.steps[-1]
    assert last_step_name == "balance"
@pytest.mark.parametrize("pca_method", ["linear", "kernel", "incremental"])
def test_pca(pca_method):
    """Check that PCA reduces the data to the requested number of components."""
    juice = pycaret.datasets.get_data("juice")
    experiment = pycaret.classification.setup(
        data=juice,
        pca=True,
        pca_method=pca_method,
        pca_components=10,
    )
    transformed, _ = experiment.pipeline.transform(experiment.X, experiment.y)
    n_columns = transformed.shape[1]
    assert n_columns == 10
def test_keep_features():
    """Check that a feature in ``keep_features`` is still present after PCA."""
    juice = pycaret.datasets.get_data("juice")
    experiment = pycaret.classification.setup(
        data=juice,
        keep_features=["Id"],
        pca=True,
        pca_components=8,
    )
    transformed, _ = experiment.pipeline.transform(experiment.X, experiment.y)
    assert "Id" in transformed
@pytest.mark.parametrize("fs_method", ["univariate", "classic", "sequential"])
def test_feature_selection(fs_method):
    """Check that feature selection keeps the requested number of features."""
    juice = pycaret.datasets.get_data("juice")
    experiment = pycaret.classification.setup(
        data=juice,
        feature_selection=True,
        feature_selection_method=fs_method,
        feature_selection_estimator="rf",
        n_features_to_select=12,
    )
    transformed, _ = experiment.pipeline.transform(experiment.X, experiment.y)
    n_selected = transformed.shape[1]
    assert n_selected == 12
def test_custom_pipeline_is_list():
    """Check that a list of (name, transformer) steps is a valid custom pipeline."""
    juice = pycaret.datasets.get_data("juice")
    custom_steps = [("pca", PCA(n_components=5))]
    experiment = pycaret.classification.setup(
        data=juice,
        custom_pipeline=custom_steps,
    )
    transformed, _ = experiment.pipeline.transform(experiment.X, experiment.y)
    # The custom PCA step is configured with 5 components.
    n_columns = transformed.shape[1]
    assert n_columns == 5
def test_custom_pipeline_is_pipeline():
    """Check that a scikit-learn ``Pipeline`` object is a valid custom pipeline."""
    juice = pycaret.datasets.get_data("juice")
    custom_steps = Pipeline(
        [("scaler", StandardScaler()), ("pca", PCA(n_components=5))]
    )
    experiment = pycaret.classification.setup(
        data=juice,
        custom_pipeline=custom_steps,
    )
    transformed, _ = experiment.pipeline.transform(experiment.X, experiment.y)
    # The custom PCA step is configured with 5 components.
    n_columns = transformed.shape[1]
    assert n_columns == 5
@pytest.mark.parametrize("pos", [-1, 0, 1])
def test_custom_pipeline_positions(pos):
    """Check that the custom step lands at the requested pipeline position."""
    cancer = pycaret.datasets.get_data("cancer")
    custom_steps = [("scaler", StandardScaler())]
    experiment = pycaret.classification.setup(
        data=cancer,
        remove_outliers=True,
        remove_multicollinearity=True,
        custom_pipeline=custom_steps,
        custom_pipeline_position=pos,
    )
    step_name, _ = experiment.pipeline.steps[pos]
    assert step_name == "scaler"