Skip to content

Commit

Permalink
Grid search implemented for some of the classes (#32)
Browse files Browse the repository at this point in the history
* Added emotion feature

* Added emotion feature fully working

* Gridsearch implemented for LR and SVM

* Added gridsearch for Multinomial NB
  • Loading branch information
vaquierm authored Oct 12, 2019
1 parent c057277 commit e6ec4f1
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 17 deletions.
13 changes: 7 additions & 6 deletions src/config.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,23 @@
# Contains the raw data downloaded from https://www.kaggle.com/c/reddit-comment-classification-comp-551/data
raw_data_dir_path: str = "../data/raw_data"
# Contains all the data in feature vector form
# Contains all the cleaned processed raw data
processed_dir_path: str = "../data/processed_data"
# Contains CSV files of the different vocabularies
vocabularies_dir_path: str = "../data/vocabularies"
# Path to which scripts will dump data
results_dir_path: str = "../results"

# These are all the different vocabulary names ("LEMMA", "STEM")
vocabularies_to_run = ["LEMMA"]
vocabularies_to_run = ["STEM", "LEMMA"]

# These are all the different vectorizers to run ("BINARY", "TFIDF")
vectorizers_to_run = ["TFIDF"]

# These are all the models to run and compare using k-fold cross-validation ("LR", "NB", "MNNB", "KNN", "DT", "RF", "SVM", "SUPER")
models_to_run = ["MNNB", "LR", "SVM", "RF", "DT"]
models_to_run = ["MNNB"]

# If this is true, run grid search on each model. This significantly increases the runtime of the validation pipeline for model types that support grid search.
run_grid_search = True

# Config to run for kaggle
kaggle_vocab = "LEMMA"
kaggle_vocab = "STEM"
kaggle_vectorizer = "TFIDF"
kaggle_model = "MNNB"
32 changes: 27 additions & 5 deletions src/utils/factory.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.model_selection import GridSearchCV
from src.models.SuperModel import SuperModel
from src.models.NaiveBayes import NaiveBayes
from sklearn.linear_model import LogisticRegression
Expand All @@ -19,21 +21,41 @@ def get_vectorizer(vectorizer_name):
else:
raise Exception("The type of vectorizer " + vectorizer_name + " is not known")

def get_model(model_name: str):

def get_model(model_name: str, grid_search: bool = False):
if model_name == "LR":
return LogisticRegression(solver='lbfgs', multi_class='auto')
if not grid_search:
return LogisticRegression(multi_class='auto', solver='lbfgs', C=1.623776739188721, max_iter=200)
else:
param_grid = {
'C': np.logspace(-4, 4, 20),
'solver': ['saga', 'lbfgs']}
return GridSearchCV(LogisticRegression(multi_class='auto'), param_grid, cv=5)
elif model_name == "NB":
return NaiveBayes()
elif model_name == "MNNB":
return MultinomialNB()
if not grid_search:
return MultinomialNB(alpha=0.28)
else:
param_grid = {
'alpha': np.arange(0.1, 0.5, 0.01).tolist()
}
return GridSearchCV(MultinomialNB(), param_grid, cv=5)
elif model_name == "KNN":
return KNeighborsClassifier()
elif model_name == "DT":
return DecisionTreeClassifier()
elif model_name == "RF":
return RandomForestClassifier(n_estimators=1000, random_state=0, class_weight='balanced')
return RandomForestClassifier(n_estimators=500, random_state=0, class_weight='balanced')
elif model_name == "SVM":
return SVC(kernel='linear', decision_function_shape='ovr', class_weight='balanced')
if not grid_search:
return SVC(kernel='linear', decision_function_shape='ovr', class_weight='balanced')
else:
param_grid = {
'kernel': ('linear', 'rbf'),
'C': [1, 10]
}
return GridSearchCV(SVC(decision_function_shape='ovr', class_weight='balanced'), param_grid, cv=5)
elif model_name == "SUPER":
return SuperModel()
else:
Expand Down
21 changes: 15 additions & 6 deletions src/validation_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from sklearn.metrics import accuracy_score


from src.config import processed_dir_path, vocabularies_to_run, vectorizers_to_run, models_to_run
from src.config import processed_dir_path, vocabularies_to_run, vectorizers_to_run, models_to_run, run_grid_search
from src.utils.utils import get_training_feature_matrix
from src.utils.factory import get_vectorizer, get_model

Expand Down Expand Up @@ -37,11 +37,20 @@ def run_validation_pipeline(mutual_info: bool = False):
X = remove_low_mutual_info_features(X, Y)

for model_to_run in models_to_run:
print("\t\t\tRunning k fold validation on model: " + model_to_run)
model = get_model(model_to_run)

# For each model run kfold validation
Y_pred = k_fold_validation(model, X, Y)
model = get_model(model_to_run, run_grid_search)

if not run_grid_search or not 'GridSearch' in str(type(model)):
print("\t\t\tRunning k fold validation on model: " + model_to_run)
# For each model run kfold validation
Y_pred = k_fold_validation(model, X, Y)
else:
print("\t\t\tRunning grid search on model: " + model_to_run)
# Grid search is enabled and this model supports it, so fit the grid search first
model.fit(X, Y)

print("\t\t\tThe best parameters for model: " + model_to_run + " are ", model.best_params_)
print("\t\t\tRunning k fold validation with the best model")
Y_pred = k_fold_validation(model.best_estimator_, X, Y)

conf_mat = confusion_matrix(Y, Y_pred)
print("\t\t\t\tAccuracy of model " + model_to_run + ": ", accuracy_score(Y, Y_pred))
Expand Down

0 comments on commit e6ec4f1

Please sign in to comment.