
Commit

Add files via upload

PyCaret authored Feb 5, 2020
1 parent b76df7e commit eaf9a7f
Showing 8 changed files with 70 additions and 72 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -2,10 +2,10 @@
PyCaret is an end-to-end open source machine learning library for the Python programming language. Its primary objective is to reduce the cycle time from hypothesis to insights by providing an easy-to-use, high-level unified API. PyCaret's vision is to become the de facto standard for teaching machine learning and data science. Our strength is our easy-to-use unified interface for both supervised and unsupervised learning. It saves the time and effort that citizen data scientists, students and researchers spend on coding, or on learning to code across different interfaces, so that they can focus on the business problem.

## Current Release
-The current release is beta 0.0.32 (as of 03/02/2020). A full release is targeted in the first week of February 2020.
+The current release is beta 0.0.33 (as of 04/02/2020). A full release is targeted in the first week of February 2020.

## Features Currently Available
-As per beta 0.0.32 the following modules are generally available:
+As per beta 0.0.33 the following modules are generally available:
* pycaret.datasets <br/>
* pycaret.classification (binary and multiclass) <br/>
* pycaret.regression <br/>
@@ -31,7 +31,7 @@ pip install pycaret
```

## Quick Start
-As of beta 0.0.32 classification, regression, nlp, arules, anomaly and clustering modules are available.
+As of beta 0.0.33 classification, regression, nlp, arules, anomaly and clustering modules are available.

### Classification / Regression

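For orientation, a minimal classification run with the API this README introduces would look roughly like the sketch below. The `juice` dataset and the `'lr'` model ID are illustrative assumptions, not taken from this commit.

```python
# A minimal PyCaret classification sketch (dataset and model ID assumed)
from pycaret.datasets import get_data
from pycaret.classification import setup, create_model

data = get_data('juice')              # load a bundled sample dataset
exp = setup(data, target='Purchase')  # imputation, encoding, train/test split in one call
lr = create_model('lr')               # train and cross-validate a logistic regression
```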
9 changes: 5 additions & 4 deletions anomaly.py
@@ -45,11 +45,11 @@ def setup(data,
Example
-------
from pycaret.datasets import get_data
-jewellery = get_data('jewellery')
+anomaly = get_data('anomaly')
-experiment_name = setup(data = jewellery, normalize = True)
+experiment_name = setup(data = anomaly, normalize = True)
-'jewellery' is a pandas DataFrame.
+'anomaly' is a pandas DataFrame.
Parameters
----------
@@ -301,7 +301,7 @@ def setup(data,

for i in ord_keys:
value_in_keys = ordinal_features.get(i)
-value_in_data = list(data[i].unique())
+value_in_data = list(data[i].unique().astype(str))
for j in value_in_keys:
if j not in value_in_data:
text = "Column name '" + str(i) + "' doesnt contain any level named '" + str(j) + "'."
@@ -846,6 +846,7 @@ def highlight_max(s):




def create_model(model = None,
fraction = 0.05,
verbose = True):
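Run end-to-end, the corrected docstring example would look roughly as follows; `'iforest'` as a model ID is an assumption based on PyCaret's PyOD-backed models, while the `fraction` default comes from the `create_model` signature above.

```python
from pycaret.datasets import get_data
from pycaret.anomaly import setup, create_model

anomaly = get_data('anomaly')                     # the dataset the docstring now references
exp = setup(data=anomaly, normalize=True)         # normalize features before fitting
iforest = create_model('iforest', fraction=0.05)  # flag roughly 5% of rows as outliers
```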
3 changes: 1 addition & 2 deletions classification.py
@@ -3,7 +3,6 @@
# License: MIT



def setup(data,
target,
train_size = 0.7,
@@ -419,7 +418,7 @@ def setup(data,

for i in ord_keys:
value_in_keys = ordinal_features.get(i)
-value_in_data = list(data[i].unique())
+value_in_data = list(data[i].unique().astype(str))
for j in value_in_keys:
if j not in value_in_data:
text = "Column name '" + str(i) + "' doesnt contain any level named '" + str(j) + "'."
3 changes: 2 additions & 1 deletion clustering.py
@@ -3,6 +3,7 @@
# License: MIT



def setup(data,
categorical_features = None,
categorical_imputation = 'constant',
@@ -301,7 +302,7 @@ def setup(data,

for i in ord_keys:
value_in_keys = ordinal_features.get(i)
-value_in_data = list(data[i].unique())
+value_in_data = list(data[i].unique().astype(str))
for j in value_in_keys:
if j not in value_in_data:
text = "Column name '" + str(i) + "' doesnt contain any level named '" + str(j) + "'."
52 changes: 22 additions & 30 deletions nlp.py
@@ -3,7 +3,6 @@
# License: MIT



def setup(data,
target=None,
custom_stopwords=None,
@@ -58,18 +57,10 @@ def setup(data,
Warnings:
---------
- If the dataset is large, Jupyter Notebook may return data update warnings due
to the status bar. To switch off the warnings, you may consider starting the
notebook with the following command in your terminal:
jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10
- Some functionalities in pycaret.nlp require you to have the English language
model. The language model is not downloaded automatically when you install
pycaret. You will have to download two models using your Anaconda Prompt or
python command line interface. To download the models, please type the
following in your command line:
python -m spacy download en_core_web_sm
@@ -152,8 +143,8 @@ def setup(data,
except:
max_sub = len(data)

-sub_progress = ipw.IntProgress(value=0, min=0, max=max_sub, step=1, bar_style='', description='Sub Process: ')
-display(sub_progress)
+#sub_progress = ipw.IntProgress(value=0, min=0, max=max_sub, step=1, bar_style='', description='Sub Process: ')
+#display(sub_progress)

timestampStr = datetime.datetime.now().strftime("%H:%M:%S")
monitor = pd.DataFrame( [ ['Initiated' , '. . . . . . . . . . . . . . . . . .', timestampStr ],
@@ -265,9 +256,9 @@ def setup(data,
review = re.sub("\d+", "", str(text[i]))
text_step1.append(review)

-sub_progress.value += 1
+#sub_progress.value += 1

-sub_progress.value = 0
+#sub_progress.value = 0

text = text_step1 #re-assigning
del(text_step1)
@@ -293,9 +284,9 @@ def setup(data,
review = re.sub(r'\s+', ' ', review)
text_step2.append(review)

-sub_progress.value += 1
+#sub_progress.value += 1

-sub_progress.value = 0
+#sub_progress.value = 0

text = text_step2 #re-assigning
del(text_step2)
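These early cleaning passes are plain `re.sub` calls: the two hunks above show the digit strip and the whitespace collapse, while a middle substitution is folded away. A standalone sketch; the non-letter strip is an assumption about the elided step:

```python
import re

review = "Product #42 was GREAT!!  10/10"
review = re.sub(r"\d+", "", review)         # strip digits, as in the hunk above
review = re.sub(r"[^a-zA-Z]", " ", review)  # assumed elided step: drop non-letters
review = re.sub(r"\s+", " ", review)        # collapse runs of whitespace
print(review)  # -> 'Product was GREAT '
```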
@@ -316,9 +307,9 @@ def setup(data,
review = gensim.utils.simple_preprocess(str(i), deacc=True)
text_step3.append(review)

-sub_progress.value += 1
+#sub_progress.value += 1

-sub_progress.value = 0
+#sub_progress.value = 0

text = text_step3
del(text_step3)
@@ -342,12 +333,12 @@ def setup(data,
ii.append(word)
text_step4.append(ii)

-sub_progress.value += 1
+#sub_progress.value += 1

text = text_step4
del(text_step4)

-sub_progress.value = 0
+#sub_progress.value = 0

progress.value += 1

@@ -366,12 +357,12 @@ def setup(data,

for i in text:
text_step5.append(bigram_mod[i])
-sub_progress.value += 1
+#sub_progress.value += 1

text = text_step5
del(text_step5)

-sub_progress.value = 0
+#sub_progress.value = 0

progress.value += 1

@@ -390,9 +381,9 @@ def setup(data,

for i in text:
text_step6.append(trigram_mod[bigram_mod[i]])
-sub_progress.value += 1
+#sub_progress.value += 1

-sub_progress.value = 0
+#sub_progress.value = 0

text = text_step6
del(text_step6)
@@ -416,9 +407,9 @@ def setup(data,
doc = nlp(" ".join(i))
text_step7.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])

-sub_progress.value += 1
+#sub_progress.value += 1

-sub_progress.value = 0
+#sub_progress.value = 0
text = text_step7
del(text_step7)
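The lemmatization step above keeps only tokens whose part of speech is in `allowed_postags` and stores their lemmas. A self-contained sketch (the noun/adjective/verb/adverb filter set is an assumption; requires the `en_core_web_sm` model mentioned in the setup docstring):

```python
import spacy

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']  # assumed filter set

doc = nlp("the models were trained quickly")
print([token.lemma_ for token in doc if token.pos_ in allowed_postags])
# -> roughly ['model', 'train', 'quickly']
```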

@@ -442,12 +433,12 @@ def setup(data,
ii.append(word)
text_step8.append(ii)

-sub_progress.value += 1
+#sub_progress.value += 1

text = text_step8
del(text_step8)

-sub_progress.value = 0
+#sub_progress.value = 0

progress.value += 1

@@ -470,9 +461,9 @@ def setup(data,
d = id2word.doc2bow(i)
corpus.append(d)

-sub_progress.value += 1
+#sub_progress.value += 1

-sub_progress.value = 0
+#sub_progress.value = 0

progress.value += 1
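Taken together, the hunks above are the standard gensim flow: tokenize, learn bigram/trigram phrases, then map tokens to integer ids and build the bag-of-words corpus via `id2word.doc2bow`. A compact standalone version with toy documents:

```python
import gensim
import gensim.corpora as corpora

docs = ["machine learning is fun", "machine learning with pycaret"]

# tokenize, lowercase and strip accents/punctuation
text = [gensim.utils.simple_preprocess(d, deacc=True) for d in docs]

# learn frequent pairs and rewrite them as single tokens, e.g. machine_learning
bigram_mod = gensim.models.phrases.Phraser(
    gensim.models.Phrases(text, min_count=1, threshold=1))
text = [bigram_mod[doc] for doc in text]

# assign integer ids and build the bag-of-words corpus
id2word = corpora.Dictionary(text)
corpus = [id2word.doc2bow(doc) for doc in text]
print(corpus[0])  # e.g. [(0, 1), (1, 1), (2, 1)]
```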

@@ -522,6 +513,7 @@ def setup(data,




def create_model(model=None,
multi_core=False,
num_topics = None,
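Most of this nlp.py change comments out a nested notebook progress bar rather than altering logic. For reference, the disabled pattern is ipywidgets' `IntProgress`; a minimal standalone sketch of how it behaves in a Jupyter notebook:

```python
import ipywidgets as ipw
from IPython.display import display

sub_progress = ipw.IntProgress(value=0, min=0, max=100, step=1,
                               bar_style='', description='Sub Process: ')
display(sub_progress)  # renders the bar in the notebook output area

for _ in range(100):
    sub_progress.value += 1  # each increment advances the rendered bar
```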
61 changes: 35 additions & 26 deletions preprocess.py
@@ -2102,17 +2102,23 @@ def Preprocess_Path_One(train_data,target_variable,ml_usecase=None,test_data =No
Following preprocessing steps are taken:
- 1) Auto infer data types
- 2) Impute (simple or with surrogate columns)
-- 3) Drop categorical variables that have zero variance or near zero variance
-- 4) Club categorical variable levels together as a new level (other_infrequent) that are rare / at the bottom 5% of the variable distribution
-- 5) Generate sub features from time features such as 'month', 'weekday', 'is_month_end', 'is_month_start' & 'hour'
-- 6) Scale & Power Transform (zscore, minmax, yeo-johnson, quantile, maxabs, robust), including the option to transform the target variable
-- 7) Apply binning to continuous variables when numeric features are provided as a list
-- 8) Detect & remove outliers using isolation forest, knn and PCA
-- 9) Apply clusters to segment the entire data
--10) One Hot / Dummy encoding
--11) Remove special characters from column names, such as commas and square brackets, to make them compatible with JSON-dependent models
--12) Fix multicollinearity
--13) Apply dimension reduction techniques such as pca_liner, pca_kernal, incremental, tsne & pls
+- 3) Ordinal Encoder
+- 4) Drop categorical variables that have zero variance or near zero variance
+- 5) Club categorical variable levels together as a new level (other_infrequent) that are rare / at the bottom 5% of the variable distribution
+- 6) Club unseen levels in the test dataset with the most/least frequent levels in the train dataset
+- 7) Generate sub features from time features such as 'month', 'weekday', 'is_month_end', 'is_month_start' & 'hour'
+- 8) Group features by calculating min, max, mean, median & sd of similar features
+- 9) Make nonlinear features (polynomial, sin, cos & tan)
+-10) Scale & Power Transform (zscore, minmax, yeo-johnson, quantile, maxabs, robust), including the option to transform the target variable
+-11) Apply binning to continuous variables when numeric features are provided as a list
+-12) Detect & remove outliers using isolation forest, knn and PCA
+-13) Apply clusters to segment the entire data
+-14) One Hot / Dummy encoding
+-15) Remove special characters from column names, such as commas and square brackets, to make them compatible with JSON-dependent models
+-16) Feature Selection through Random Forest, LightGBM and Pearson Correlation
+-17) Fix multicollinearity
+-18) Feature Interaction (DFS): multiply, divide, add and subtract features
+-19) Apply dimension reduction techniques such as pca_liner, pca_kernal, incremental, tsne
+- except for pca_liner, all other methods only take the number of components (as an integer), i.e. no variance-explained method is available
'''
global c2, subcase
@@ -2272,10 +2278,10 @@ def Preprocess_Path_One(train_data,target_variable,ml_usecase=None,test_data =No
pipe = Pipeline([
('dtypes',dtypes),
('imputer',imputer),
+('ordinal',ordinal),
('znz',znz),
('club_R_L',club_R_L),
('new_levels',new_levels),
-('ordinal',ordinal),
('feature_time',feature_time),
('group',group),
('nonliner',nonliner),
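This hunk moves `('ordinal', ordinal)` ahead of the zero/near-zero-variance drop (`znz`), matching the renumbered docstring; the same reorder appears in `Preprocess_Path_Two` below. One plausible motivation, illustrated with generic sklearn stand-ins rather than PyCaret's own transformers: a variance filter needs numeric input, so ordinal columns must be encoded before it runs.

```python
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import VarianceThreshold

# stand-ins for the 'ordinal' and 'znz' steps (illustrative only)
pipe = Pipeline([
    ('ordinal', OrdinalEncoder()),    # encode string levels to integers first
    ('znz', VarianceThreshold(0.0)),  # then drop zero-variance columns
])

X = pd.DataFrame({'grade': ['low', 'medium', 'high', 'low']})
print(pipe.fit_transform(X))  # encoded and variance-filtered
```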
@@ -2321,20 +2327,23 @@ def Preprocess_Path_Two(train_data,ml_usecase=None,test_data =None,categorical_f

'''
Following preprocessing steps are taken:
-- THIS IS BUILT FOR UNSUPERVISED LEARNING, FOLLOWS THE SAME PATH AS Path_One
+- THIS IS BUILT FOR UNSUPERVISED LEARNING
- 1) Auto infer data types
- 2) Impute (simple or with surrogate columns)
-- 3) Drop categorical variables that have zero variance or near zero variance
-- 4) Club categorical variable levels together as a new level (other_infrequent) that are rare / at the bottom 5% of the variable distribution
-- 5) Generate sub features from time features such as 'month', 'weekday', 'is_month_end', 'is_month_start' & 'hour'
-- 6) Scale & Power Transform (zscore, minmax, yeo-johnson, quantile, maxabs, robust), including the option to transform the target variable
-- 7) Apply binning to continuous variables when numeric features are provided as a list
-- 8) Detect & remove outliers using isolation forest, knn and PCA
-- 9) One Hot / Dummy encoding
--10) Remove special characters from column names, such as commas and square brackets, to make them compatible with JSON-dependent models
--11) Fix multicollinearity
--12) Apply dimension reduction techniques such as pca_liner, pca_kernal, incremental, tsne & pls
-- except for pca_liner, all other methods only take the number of components (as an integer), i.e. no variance-explained method is available
+- 3) Ordinal Encoder
+- 4) Drop categorical variables that have zero variance or near zero variance
+- 5) Club categorical variable levels together as a new level (other_infrequent) that are rare / at the bottom 5% of the variable distribution
+- 6) Club unseen levels in the test dataset with the most/least frequent levels in the train dataset
+- 7) Generate sub features from time features such as 'month', 'weekday', 'is_month_end', 'is_month_start' & 'hour'
+- 8) Group features by calculating min, max, mean, median & sd of similar features
+- 9) Scale & Power Transform (zscore, minmax, yeo-johnson, quantile, maxabs, robust), including the option to transform the target variable
+-10) Apply binning to continuous variables when numeric features are provided as a list
+-11) Detect & remove outliers using isolation forest, knn and PCA
+-12) One Hot / Dummy encoding
+-13) Remove special characters from column names, such as commas and square brackets, to make them compatible with JSON-dependent models
+-14) Fix multicollinearity
+-15) Apply dimension reduction techniques such as pca_liner, pca_kernal, incremental, tsne
+- except for pca_liner, all other methods only take the number of components (as an integer), i.e. no variance-explained method is available
'''

# just make a dummy target variable
@@ -2454,10 +2463,10 @@ def Preprocess_Path_Two(train_data,ml_usecase=None,test_data =None,categorical_f
pipe = Pipeline([
('dtypes',dtypes),
('imputer',imputer),
+('ordinal',ordinal),
('znz',znz),
('club_R_L',club_R_L),
('new_levels',new_levels),
-('ordinal',ordinal),
('feature_time',feature_time),
('group',group),
('scaling',scaling),
@@ -2476,4 +2485,4 @@ def Preprocess_Path_Two(train_data,ml_usecase=None,test_data =None,categorical_f
return(train_t.drop(target_variable,axis=1),test_t)
else:
train_t = pipe.fit_transform(train_data)
-return(train_t.drop(target_variable,axis=1))
+return(train_t.drop(target_variable,axis=1))
6 changes: 1 addition & 5 deletions regression.py
@@ -3,7 +3,6 @@
# License: MIT



def setup(data,
target,
train_size=0.7,
@@ -427,7 +426,7 @@ def setup(data,

for i in ord_keys:
value_in_keys = ordinal_features.get(i)
-value_in_data = list(data[i].unique())
+value_in_data = list(data[i].unique().astype(str))
for j in value_in_keys:
if j not in value_in_data:
text = "Column name '" + str(i) + "' doesnt contain any level named '" + str(j) + "'."
@@ -1443,9 +1442,6 @@ def highlight_max(s):






def create_model(estimator = None,
ensemble = False,
method = None,
2 changes: 1 addition & 1 deletion setup.py
@@ -27,7 +27,7 @@ def readme():

setup(
name="pycaret",
version="0.0.32",
version="0.0.33",
description="A Python package for supervised and unsupervised machine learning.",
long_description=readme(),
long_description_content_type="text/markdown",
