
Commit

Add files via upload

PyCaret authored Feb 5, 2020
1 parent b76df7e commit eaf9a7f
Showing 8 changed files with 70 additions and 72 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -2,10 +2,10 @@
PyCaret is an end-to-end open source machine learning library for the Python programming language. Its primary objective is to reduce the cycle time from hypothesis to insights by providing an easy-to-use, high-level unified API. PyCaret's vision is to become the de facto standard for teaching machine learning and data science. Our strength is our easy-to-use unified interface for both supervised and unsupervised learning. It saves the time and effort that citizen data scientists, students and researchers spend on coding, or on learning to code across different interfaces, so that they can focus on the business problem.

## Current Release
-The current release is beta 0.0.32 (as of 03/02/2020). A full release is targeted in the first week of February 2020.
+The current release is beta 0.0.33 (as of 04/02/2020). A full release is targeted in the first week of February 2020.

## Features Currently Available
-As per beta 0.0.32 the following modules are generally available:
+As per beta 0.0.33 the following modules are generally available:
* pycaret.datasets <br/>
* pycaret.classification (binary and multiclass) <br/>
* pycaret.regression <br/>
@@ -31,7 +31,7 @@ pip install pycaret
```

## Quick Start
-As of beta 0.0.32 classification, regression, nlp, arules, anomaly and clustering modules are available.
+As of beta 0.0.33 classification, regression, nlp, arules, anomaly and clustering modules are available.

### Classification / Regression

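For orientation, a minimal classification run with the API this README introduces would look roughly like the sketch below. The `juice` dataset and the `'lr'` model ID are illustrative assumptions, not taken from this commit.

```python
# A minimal PyCaret classification sketch (dataset and model ID assumed)
from pycaret.datasets import get_data
from pycaret.classification import setup, create_model

data = get_data('juice')              # load a bundled sample dataset
exp = setup(data, target='Purchase')  # imputation, encoding, train/test split in one call
lr = create_model('lr')               # train and cross-validate a logistic regression
```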
9 changes: 5 additions & 4 deletions anomaly.py
@@ -45,11 +45,11 @@ def setup(data,
Example
-------
from pycaret.datasets import get_data
-jewellery = get_data('jewellery')
+anomaly = get_data('anomaly')
-experiment_name = setup(data = jewellery, normalize = True)
+experiment_name = setup(data = anomaly, normalize = True)
-'jewellery' is a pandas DataFrame.
+'anomaly' is a pandas DataFrame.
Parameters
----------
@@ -301,7 +301,7 @@ def setup(data,

for i in ord_keys:
value_in_keys = ordinal_features.get(i)
-value_in_data = list(data[i].unique())
+value_in_data = list(data[i].unique().astype(str))
for j in value_in_keys:
if j not in value_in_data:
text = "Column name '" + str(i) + "' doesnt contain any level named '" + str(j) + "'."
@@ -846,6 +846,7 @@ def highlight_max(s):




def create_model(model = None,
fraction = 0.05,
verbose = True):
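Run end-to-end, the corrected docstring example would look roughly as follows; `'iforest'` as a model ID is an assumption based on PyCaret's PyOD-backed models, while the `fraction` default comes from the `create_model` signature above.

```python
from pycaret.datasets import get_data
from pycaret.anomaly import setup, create_model

anomaly = get_data('anomaly')                     # the dataset the docstring now references
exp = setup(data=anomaly, normalize=True)         # normalize features before fitting
iforest = create_model('iforest', fraction=0.05)  # flag roughly 5% of rows as outliers
```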
3 changes: 1 addition & 2 deletions classification.py
@@ -3,7 +3,6 @@
# License: MIT



def setup(data,
target,
train_size = 0.7,
@@ -419,7 +418,7 @@ def setup(data,

for i in ord_keys:
value_in_keys = ordinal_features.get(i)
-value_in_data = list(data[i].unique())
+value_in_data = list(data[i].unique().astype(str))
for j in value_in_keys:
if j not in value_in_data:
text = "Column name '" + str(i) + "' doesnt contain any level named '" + str(j) + "'."
3 changes: 2 additions & 1 deletion clustering.py
@@ -3,6 +3,7 @@
# License: MIT



def setup(data,
categorical_features = None,
categorical_imputation = 'constant',
@@ -301,7 +302,7 @@ def setup(data,

for i in ord_keys:
value_in_keys = ordinal_features.get(i)
-value_in_data = list(data[i].unique())
+value_in_data = list(data[i].unique().astype(str))
for j in value_in_keys:
if j not in value_in_data:
text = "Column name '" + str(i) + "' doesnt contain any level named '" + str(j) + "'."
52 changes: 22 additions & 30 deletions nlp.py
@@ -3,7 +3,6 @@
# License: MIT



def setup(data,
target=None,
custom_stopwords=None,
@@ -58,18 +57,10 @@ def setup(data,
Warnings:
---------
- If the dataset is large, Jupyter Notebook may return data update warnings due
to the status bar. To switch off the warnings, you may consider starting the
notebook with the following command in your terminal:
jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10
- Some functionalities in pycaret.nlp require you to have the English language
model. The language model is not downloaded automatically when you install
pycaret. You will have to download two models using your Anaconda Prompt or
python command line interface. To download the models, please type the
following in your command line:
python -m spacy download en_core_web_sm
@@ -152,8 +143,8 @@ def setup(data,
except:
max_sub = len(data)

-sub_progress = ipw.IntProgress(value=0, min=0, max=max_sub, step=1, bar_style='', description='Sub Process: ')
-display(sub_progress)
+#sub_progress = ipw.IntProgress(value=0, min=0, max=max_sub, step=1, bar_style='', description='Sub Process: ')
+#display(sub_progress)

timestampStr = datetime.datetime.now().strftime("%H:%M:%S")
monitor = pd.DataFrame( [ ['Initiated' , '. . . . . . . . . . . . . . . . . .', timestampStr ],
@@ -265,9 +256,9 @@ def setup(data,
review = re.sub("\d+", "", str(text[i]))
text_step1.append(review)

-sub_progress.value += 1
+#sub_progress.value += 1

-sub_progress.value = 0
+#sub_progress.value = 0

text = text_step1 #re-assigning
del(text_step1)
@@ -293,9 +284,9 @@ def setup(data,
review = re.sub(r'\s+', ' ', review)
text_step2.append(review)

-sub_progress.value += 1
+#sub_progress.value += 1

-sub_progress.value = 0
+#sub_progress.value = 0

text = text_step2 #re-assigning
del(text_step2)
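These early cleaning passes are plain `re.sub` calls: the two hunks above show the digit strip and the whitespace collapse, while a middle substitution is folded away. A standalone sketch; the non-letter strip is an assumption about the elided step:

```python
import re

review = "Product #42 was GREAT!!  10/10"
review = re.sub(r"\d+", "", review)         # strip digits, as in the hunk above
review = re.sub(r"[^a-zA-Z]", " ", review)  # assumed elided step: drop non-letters
review = re.sub(r"\s+", " ", review)        # collapse runs of whitespace
print(review)  # -> 'Product was GREAT '
```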
@@ -316,9 +307,9 @@ def setup(data,
review = gensim.utils.simple_preprocess(str(i), deacc=True)
text_step3.append(review)

-sub_progress.value += 1
+#sub_progress.value += 1

-sub_progress.value = 0
+#sub_progress.value = 0

text = text_step3
del(text_step3)
@@ -342,12 +333,12 @@ def setup(data,
ii.append(word)
text_step4.append(ii)

-sub_progress.value += 1
+#sub_progress.value += 1

text = text_step4
del(text_step4)

-sub_progress.value = 0
+#sub_progress.value = 0

progress.value += 1

@@ -366,12 +357,12 @@ def setup(data,

for i in text:
text_step5.append(bigram_mod[i])
-sub_progress.value += 1
+#sub_progress.value += 1

text = text_step5
del(text_step5)

-sub_progress.value = 0
+#sub_progress.value = 0

progress.value += 1

@@ -390,9 +381,9 @@ def setup(data,

for i in text:
text_step6.append(trigram_mod[bigram_mod[i]])
-sub_progress.value += 1
+#sub_progress.value += 1

-sub_progress.value = 0
+#sub_progress.value = 0

text = text_step6
del(text_step6)
@@ -416,9 +407,9 @@ def setup(data,
doc = nlp(" ".join(i))
text_step7.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])

-sub_progress.value += 1
+#sub_progress.value += 1

-sub_progress.value = 0
+#sub_progress.value = 0
text = text_step7
del(text_step7)
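The lemmatization step above keeps only tokens whose part of speech is in `allowed_postags` and stores their lemmas. A self-contained sketch (the noun/adjective/verb/adverb filter set is an assumption; requires the `en_core_web_sm` model mentioned in the setup docstring):

```python
import spacy

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']  # assumed filter set

doc = nlp("the models were trained quickly")
print([token.lemma_ for token in doc if token.pos_ in allowed_postags])
# -> roughly ['model', 'train', 'quickly']
```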

@@ -442,12 +433,12 @@ def setup(data,
ii.append(word)
text_step8.append(ii)

-sub_progress.value += 1
+#sub_progress.value += 1

text = text_step8
del(text_step8)

-sub_progress.value = 0
+#sub_progress.value = 0

progress.value += 1

@@ -470,9 +461,9 @@ def setup(data,
d = id2word.doc2bow(i)
corpus.append(d)

-sub_progress.value += 1
+#sub_progress.value += 1

-sub_progress.value = 0
+#sub_progress.value = 0

progress.value += 1
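Taken together, the hunks above are the standard gensim flow: tokenize, learn bigram/trigram phrases, then map tokens to integer ids and build the bag-of-words corpus via `id2word.doc2bow`. A compact standalone version with toy documents:

```python
import gensim
import gensim.corpora as corpora

docs = ["machine learning is fun", "machine learning with pycaret"]

# tokenize, lowercase and strip accents/punctuation
text = [gensim.utils.simple_preprocess(d, deacc=True) for d in docs]

# learn frequent pairs and rewrite them as single tokens, e.g. machine_learning
bigram_mod = gensim.models.phrases.Phraser(
    gensim.models.Phrases(text, min_count=1, threshold=1))
text = [bigram_mod[doc] for doc in text]

# assign integer ids and build the bag-of-words corpus
id2word = corpora.Dictionary(text)
corpus = [id2word.doc2bow(doc) for doc in text]
print(corpus[0])  # e.g. [(0, 1), (1, 1), (2, 1)]
```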

@@ -522,6 +513,7 @@ def setup(data,




def create_model(model=None,
multi_core=False,
num_topics = None,
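Most of this nlp.py change comments out a nested notebook progress bar rather than altering logic. For reference, the disabled pattern is ipywidgets' `IntProgress`; a minimal standalone sketch of how it behaves in a Jupyter notebook:

```python
import ipywidgets as ipw
from IPython.display import display

sub_progress = ipw.IntProgress(value=0, min=0, max=100, step=1,
                               bar_style='', description='Sub Process: ')
display(sub_progress)  # renders the bar in the notebook output area

for _ in range(100):
    sub_progress.value += 1  # each increment advances the rendered bar
```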
61 changes: 35 additions & 26 deletions preprocess.py
@@ -2102,17 +2102,23 @@ def Preprocess_Path_One(train_data,target_variable,ml_usecase=None,test_data =No
Following preprocessing steps are taken:
- 1) Auto infer data types
- 2) Impute (simple or with surrogate columns)
-- 3) Drop categorical variables that have zero variance or near zero variance
-- 4) Club categorical variable levels together as a new level (other_infrequent) that are rare / at the bottom 5% of the variable distribution
-- 5) Generate sub features from time features such as 'month', 'weekday', 'is_month_end', 'is_month_start' & 'hour'
-- 6) Scale & Power Transform (zscore, minmax, yeo-johnson, quantile, maxabs, robust), including the option to transform the target variable
-- 7) Apply binning to continuous variables when numeric features are provided as a list
-- 8) Detect & remove outliers using isolation forest, knn and PCA
-- 9) Apply clusters to segment the entire data
--10) One Hot / Dummy encoding
--11) Remove special characters from column names, such as commas and square brackets, to make them compatible with JSON-dependent models
--12) Fix multicollinearity
--13) Apply dimension reduction techniques such as pca_liner, pca_kernal, incremental, tsne & pls
+- 3) Ordinal Encoder
+- 4) Drop categorical variables that have zero variance or near zero variance
+- 5) Club categorical variable levels together as a new level (other_infrequent) that are rare / at the bottom 5% of the variable distribution
+- 6) Club unseen levels in the test dataset with the most/least frequent levels in the train dataset
+- 7) Generate sub features from time features such as 'month', 'weekday', 'is_month_end', 'is_month_start' & 'hour'
+- 8) Group features by calculating min, max, mean, median & sd of similar features
+- 9) Make nonlinear features (polynomial, sin, cos & tan)
+-10) Scale & Power Transform (zscore, minmax, yeo-johnson, quantile, maxabs, robust), including the option to transform the target variable
+-11) Apply binning to continuous variables when numeric features are provided as a list
+-12) Detect & remove outliers using isolation forest, knn and PCA
+-13) Apply clusters to segment the entire data
+-14) One Hot / Dummy encoding
+-15) Remove special characters from column names, such as commas and square brackets, to make them compatible with JSON-dependent models
+-16) Feature Selection through Random Forest, LightGBM and Pearson Correlation
+-17) Fix multicollinearity
+-18) Feature Interaction (DFS): multiply, divide, add and subtract features
+-19) Apply dimension reduction techniques such as pca_liner, pca_kernal, incremental, tsne
+- except for pca_liner, all other methods only take the number of components (as an integer), i.e. no variance-explained method is available
'''
global c2, subcase
@@ -2272,10 +2278,10 @@ def Preprocess_Path_One(train_data,target_variable,ml_usecase=None,test_data =No
pipe = Pipeline([
('dtypes',dtypes),
('imputer',imputer),
+('ordinal',ordinal),
('znz',znz),
('club_R_L',club_R_L),
('new_levels',new_levels),
-('ordinal',ordinal),
('feature_time',feature_time),
('group',group),
('nonliner',nonliner),
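This hunk moves `('ordinal', ordinal)` ahead of the zero/near-zero-variance drop (`znz`), matching the renumbered docstring; the same reorder appears in `Preprocess_Path_Two` below. One plausible motivation, illustrated with generic sklearn stand-ins rather than PyCaret's own transformers: a variance filter needs numeric input, so ordinal columns must be encoded before it runs.

```python
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import VarianceThreshold

# stand-ins for the 'ordinal' and 'znz' steps (illustrative only)
pipe = Pipeline([
    ('ordinal', OrdinalEncoder()),    # encode string levels to integers first
    ('znz', VarianceThreshold(0.0)),  # then drop zero-variance columns
])

X = pd.DataFrame({'grade': ['low', 'medium', 'high', 'low']})
print(pipe.fit_transform(X))  # encoded and variance-filtered
```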
@@ -2321,20 +2327,23 @@ def Preprocess_Path_Two(train_data,ml_usecase=None,test_data =None,categorical_f

'''
Following preprocessing steps are taken:
-- THIS IS BUILT FOR UNSUPERVISED LEARNING, FOLLOWS THE SAME PATH AS Path_One
+- THIS IS BUILT FOR UNSUPERVISED LEARNING
- 1) Auto infer data types
- 2) Impute (simple or with surrogate columns)
-- 3) Drop categorical variables that have zero variance or near zero variance
-- 4) Club categorical variable levels together as a new level (other_infrequent) that are rare / at the bottom 5% of the variable distribution
-- 5) Generate sub features from time features such as 'month', 'weekday', 'is_month_end', 'is_month_start' & 'hour'
-- 6) Scale & Power Transform (zscore, minmax, yeo-johnson, quantile, maxabs, robust), including the option to transform the target variable
-- 7) Apply binning to continuous variables when numeric features are provided as a list
-- 8) Detect & remove outliers using isolation forest, knn and PCA
-- 9) One Hot / Dummy encoding
--10) Remove special characters from column names, such as commas and square brackets, to make them compatible with JSON-dependent models
--11) Fix multicollinearity
--12) Apply dimension reduction techniques such as pca_liner, pca_kernal, incremental, tsne & pls
-- except for pca_liner, all other methods only take the number of components (as an integer), i.e. no variance-explained method is available
+- 3) Ordinal Encoder
+- 4) Drop categorical variables that have zero variance or near zero variance
+- 5) Club categorical variable levels together as a new level (other_infrequent) that are rare / at the bottom 5% of the variable distribution
+- 6) Club unseen levels in the test dataset with the most/least frequent levels in the train dataset
+- 7) Generate sub features from time features such as 'month', 'weekday', 'is_month_end', 'is_month_start' & 'hour'
+- 8) Group features by calculating min, max, mean, median & sd of similar features
+- 9) Scale & Power Transform (zscore, minmax, yeo-johnson, quantile, maxabs, robust), including the option to transform the target variable
+-10) Apply binning to continuous variables when numeric features are provided as a list
+-11) Detect & remove outliers using isolation forest, knn and PCA
+-12) One Hot / Dummy encoding
+-13) Remove special characters from column names, such as commas and square brackets, to make them compatible with JSON-dependent models
+-14) Fix multicollinearity
+-15) Apply dimension reduction techniques such as pca_liner, pca_kernal, incremental, tsne
+- except for pca_liner, all other methods only take the number of components (as an integer), i.e. no variance-explained method is available
'''

# just make a dummy target variable
@@ -2454,10 +2463,10 @@ def Preprocess_Path_Two(train_data,ml_usecase=None,test_data =None,categorical_f
pipe = Pipeline([
('dtypes',dtypes),
('imputer',imputer),
+('ordinal',ordinal),
('znz',znz),
('club_R_L',club_R_L),
('new_levels',new_levels),
-('ordinal',ordinal),
('feature_time',feature_time),
('group',group),
('scaling',scaling),
@@ -2476,4 +2485,4 @@ def Preprocess_Path_Two(train_data,ml_usecase=None,test_data =None,categorical_f
return(train_t.drop(target_variable,axis=1),test_t)
else:
train_t = pipe.fit_transform(train_data)
-return(train_t.drop(target_variable,axis=1))
+return(train_t.drop(target_variable,axis=1))
6 changes: 1 addition & 5 deletions regression.py
@@ -3,7 +3,6 @@
# License: MIT



def setup(data,
target,
train_size=0.7,
@@ -427,7 +426,7 @@ def setup(data,

for i in ord_keys:
value_in_keys = ordinal_features.get(i)
-value_in_data = list(data[i].unique())
+value_in_data = list(data[i].unique().astype(str))
for j in value_in_keys:
if j not in value_in_data:
text = "Column name '" + str(i) + "' doesnt contain any level named '" + str(j) + "'."
@@ -1443,9 +1442,6 @@ def highlight_max(s):






def create_model(estimator = None,
ensemble = False,
method = None,
2 changes: 1 addition & 1 deletion setup.py
@@ -27,7 +27,7 @@ def readme():

setup(
name="pycaret",
version="0.0.32",
version="0.0.33",
description="A Python package for supervised and unsupervised machine learning.",
long_description=readme(),
long_description_content_type="text/markdown",
