Grid search implemented for some of the classes (#32)

* Added emotion feature * Added emotion feature fully working * Gridsearch implemented for LR and SVM * Added gridsearch for Multinomial NB
vaquierm · Oct 12, 2019 · e6ec4f1 · e6ec4f1
1 parent c057277
commit e6ec4f1
Show file tree

Hide file tree

Showing 3 changed files with 49 additions and 17 deletions.
diff --git a/src/config.py b/src/config.py
@@ -1,22 +1,23 @@
 # Contains the raw data downloaded from https://www.kaggle.com/c/reddit-comment-classification-comp-551/data
 raw_data_dir_path: str = "../data/raw_data"
-# Contains all the data in feature vector form
+# Contains all the cleaned processed raw data
 processed_dir_path: str = "../data/processed_data"
-# Contain csv files of different vocabularies
-vocabularies_dir_path: str = "../data/vocabularies"
 # Path to which scripts will dump data
 results_dir_path: str = "../results"
 
 # These are all the different dictionary names ("LEMMA", "STEM")
-vocabularies_to_run = ["LEMMA"]
+vocabularies_to_run = ["STEM", "LEMMA"]
 
 # These are all the different vectorizers to run ("BINARY", "TFIDF")
 vectorizers_to_run = ["TFIDF"]
 
 # These are all the models to run and compare performance on a k fold cross validation ("LR", "NB", "MNNB", "KNN", "DT", "RF", "SVM", "SUPER")
-models_to_run = ["MNNB", "LR", "SVM", "RF", "DT"]
+models_to_run = ["MNNB"]
+
+# If True, run grid search on each model type that supports it (this significantly increases the runtime of the validation pipeline)
+run_grid_search = True
 
 # Config to run for kaggle
-kaggle_vocab = "LEMMA"
+kaggle_vocab = "STEM"
 kaggle_vectorizer = "TFIDF"
 kaggle_model = "MNNB"
diff --git a/src/utils/factory.py b/src/utils/factory.py
@@ -1,4 +1,6 @@
+import numpy as np
 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
+from sklearn.model_selection import GridSearchCV
 from src.models.SuperModel import SuperModel
 from src.models.NaiveBayes import NaiveBayes
 from sklearn.linear_model import LogisticRegression
@@ -19,21 +21,41 @@ def get_vectorizer(vectorizer_name):
     else:
         raise Exception("The type of vectorizer " + vectorizer_name + " is not known")
 
-def get_model(model_name: str):
+
+def get_model(model_name: str, grid_search: bool = False):
     if model_name == "LR":
-        return LogisticRegression(solver='lbfgs', multi_class='auto')
+        if not grid_search:
+            return LogisticRegression(multi_class='auto', solver='lbfgs', C=1.623776739188721, max_iter=200)
+        else:
+            param_grid = {
+                 'C': np.logspace(-4, 4, 20),
+                 'solver': ['saga', 'lbfgs']}
+            return GridSearchCV(LogisticRegression(multi_class='auto'), param_grid, cv=5)
     elif model_name == "NB":
         return NaiveBayes()
     elif model_name == "MNNB":
-        return MultinomialNB()
+        if not grid_search:
+            return MultinomialNB(alpha=0.28)
+        else:
+            param_grid = {
+                'alpha': np.arange(0.1, 0.5, 0.01).tolist()
+            }
+            return GridSearchCV(MultinomialNB(), param_grid, cv=5)
     elif model_name == "KNN":
         return KNeighborsClassifier()
     elif model_name == "DT":
         return DecisionTreeClassifier()
     elif model_name == "RF":
-        return RandomForestClassifier(n_estimators=1000, random_state=0, class_weight='balanced')
+        return RandomForestClassifier(n_estimators=500, random_state=0, class_weight='balanced')
     elif model_name == "SVM":
-        return SVC(kernel='linear', decision_function_shape='ovr', class_weight='balanced')
+        if not grid_search:
+            return SVC(kernel='linear', decision_function_shape='ovr', class_weight='balanced')
+        else:
+            param_grid = {
+                'kernel': ('linear', 'rbf'),
+                'C': [1, 10]
+            }
+            return GridSearchCV(SVC(decision_function_shape='ovr', class_weight='balanced'), param_grid, cv=5)
     elif model_name == "SUPER":
         return SuperModel()
     else:

diff --git a/src/validation_pipeline.py b/src/validation_pipeline.py
@@ -6,7 +6,7 @@
 from sklearn.metrics import accuracy_score
 
 
-from src.config import processed_dir_path, vocabularies_to_run, vectorizers_to_run, models_to_run
+from src.config import processed_dir_path, vocabularies_to_run, vectorizers_to_run, models_to_run, run_grid_search
 from src.utils.utils import get_training_feature_matrix
 from src.utils.factory import get_vectorizer, get_model
 
@@ -37,11 +37,20 @@ def run_validation_pipeline(mutual_info: bool = False):
                 X = remove_low_mutual_info_features(X, Y)
 
             for model_to_run in models_to_run:
-                print("\t\t\tRunning k fold validation on model: " + model_to_run)
-                model = get_model(model_to_run)
-
-                # For each model run kfold validation
-                Y_pred = k_fold_validation(model, X, Y)
+                model = get_model(model_to_run, run_grid_search)
+
+                if not run_grid_search or not 'GridSearch' in str(type(model)):
+                    print("\t\t\tRunning k fold validation on model: " + model_to_run)
+                    # For each model run kfold validation
+                    Y_pred = k_fold_validation(model, X, Y)
+                else:
+                    print("\t\t\tRunning grid search on model: " + model_to_run)
+                    # If we want to run grid search
+                    model.fit(X, Y)
+
+                    print("\t\t\tThe best parameters for model: " + model_to_run + " are ", model.best_params_)
+                    print("\t\t\tRunning k fold validation with the best model")
+                    Y_pred = k_fold_validation(model.best_estimator_, X, Y)
 
                 conf_mat = confusion_matrix(Y, Y_pred)
                 print("\t\t\t\tAccuracy of model " + model_to_run + ": ", accuracy_score(Y, Y_pred))