Skip to content

Commit

Permalink
Merge pull request #34 from vaquierm/marine/BNB_Issue1
Browse files Browse the repository at this point in the history
Implementation of Bernoulli Naive Bayes
  • Loading branch information
hmarine authored Oct 16, 2019
2 parents a612e67 + 65f43c8 commit 4fa897c
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 8 deletions.
2 changes: 1 addition & 1 deletion src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# These are all the different vectorizers to run ("BINARY", "TFIDF")
vectorizers_to_run = ["TFIDF"]

# These are all the models to run and compare performance on a k fold cross validation ("LR", "NB", "MNNB", "KNN", "DT", "RF", "SVM", "SUPER")
# These are all the models to run and compare performance on a k fold cross validation ("LR", "NB", "NB_SKLEARN", "MNNB", "KNN", "DT", "RF", "SVM", "SUPER")
models_to_run = ["MNNB"]

# If this is true, run gridsearch on each model (This will significantly increase the runtime of the validation pipeline for model types that support gridsearch)
Expand Down
61 changes: 59 additions & 2 deletions src/models/NaiveBayes.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,18 @@
import numpy as np

from sklearn.preprocessing import LabelBinarizer
from sklearn.utils.extmath import safe_sparse_dot
from src.models.Model import Model


class NaiveBayes(Model):
    """
    Bernoulli Naive Bayes classifier with additive (Laplace) smoothing.

    After fit(), self.parameters holds:
      - 'parameter_k':      list with the prior of each class
                            (# samples of the class / # samples)
      - 'parameter_log_kj': array of shape (n_classes, n_features) with the
                            log of the smoothed per-class feature probabilities
      - 'classes':          the sorted unique labels; row k of the arrays above
                            belongs to classes[k]

    NOTE(review): the estimates are only valid probabilities when X holds 0/1
    indicators. Feature values above 1 can push an estimate above 1, which
    makes log(1 - theta) in predict() come out as NaN -- confirm that the
    vectorizer feeding this model is binary.
    """

    def __init__(self, alpha: float = 1):
        """
        Create an unfitted classifier
        :param alpha: Additive smoothing strength, must be strictly positive
        :raises ValueError: If alpha is not greater than zero
        """
        # alpha == 0 is rejected as well: an unseen (class, feature) pair would
        # otherwise yield log(0) during fit.
        if alpha <= 0:
            raise ValueError("Alpha must be greater than zero")

        self.alpha = alpha

    def fit(self, X, Y):
        """
        Fit the model with the training data
        :param X: Inputs, one row per sample (dense array or scipy sparse matrix)
        :param Y: Labels, one per row of X
        :return: None
        """
        super().fit(X, Y)

        labels = np.asarray(Y).ravel()

        # One pass gives the sorted classes, the class index of every sample
        # and the number of samples per class.
        classes, class_index, class_counts = np.unique(
            labels, return_inverse=True, return_counts=True
        )
        class_index = class_index.ravel()
        class_counts = class_counts.astype(float)

        # theta_k = # samples of class k / total number of samples
        priors = class_counts / float(X.shape[0])

        # One-hot class membership, shape (n_samples, n_classes). Built by hand
        # so that the two-class case also gets one column per class.
        indicator = np.zeros((labels.shape[0], classes.shape[0]))
        indicator[np.arange(labels.shape[0]), class_index] = 1.0

        # theta_kj = (sum of feature j over class k + alpha) / (# class k + 2 * alpha)
        feature_counts = safe_sparse_dot(indicator.T, X) + self.alpha
        smoothed_totals = class_counts.reshape(-1, 1) + 2 * self.alpha
        log_thetakj = np.log(feature_counts) - np.log(smoothed_totals)

        self.parameters = {
            'parameter_k': priors.tolist(),
            'parameter_log_kj': log_thetakj,
            'classes': classes,
        }

    def predict(self, X):
        """
        Predict the labels based on the inputs
        :param X: Inputs, with the same feature columns used in fit
        :return: The predicted labels based on the training
        """
        super().predict(X)

        log_thetakj = self.parameters["parameter_log_kj"]
        # log(1 - theta_kj); log1p keeps precision when theta_kj is tiny
        log_one_minus_thetakj = np.log1p(-np.exp(log_thetakj))

        # log P(k | x) up to a constant:
        #   log theta_k
        #   + sum_j x_j * (log theta_kj - log(1 - theta_kj))
        #   + sum_j log(1 - theta_kj)
        log_prior = np.log(self.parameters["parameter_k"])
        feature_weights = log_thetakj - log_one_minus_thetakj
        present_term = safe_sparse_dot(X, feature_weights.T)
        absent_term = log_one_minus_thetakj.sum(axis=1)
        scores = log_prior + present_term + absent_term

        # Map the winning column back to the label it stands for.
        return self.parameters["classes"][np.argmax(scores, axis=1)]
4 changes: 3 additions & 1 deletion src/utils/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import MultinomialNB, BernoulliNB


def get_vectorizer(vectorizer_name):
Expand All @@ -33,6 +33,8 @@ def get_model(model_name: str, grid_search: bool = False):
return GridSearchCV(LogisticRegression(multi_class='auto'), param_grid, cv=5)
elif model_name == "NB":
return NaiveBayes()
elif model_name == "NB_SKLEARN":
return BernoulliNB()
elif model_name == "MNNB":
if not grid_search:
return MultinomialNB(alpha=0.0001)
Expand Down
8 changes: 4 additions & 4 deletions src/validation_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,14 +62,14 @@ def run_validation_pipeline(linear_correlation: bool = True):

print("\t\t\tThe best parameters for model: " + model_to_run + " are ", model.best_params_)
print("\t\t\tRunning k fold validation with the best model")
acc, conf_mat = k_fold_validation(model.best_estimator_, X_trains, X_tests, Y_trains, Y_tests, linear_correlation)
accuracy, conf_mat = k_fold_validation(model.best_estimator_, X_trains, X_tests, Y_trains, Y_tests, linear_correlation)

results_confusion_matrix_file = os.path.join(results_dir_path, vocabulary + "_"+ vec + "_" + model_to_run + "_" + "confusion.png")
save_confusion_matrix(conf_mat, "Confusion Matrix for vocabulary " + vocabulary + ", vectorizer " + vec + "and model " + model_to_run, list(map(lambda pred: int_to_subreddit[pred], unique_labels(Y))), results_confusion_matrix_file)
print("\t\t\t\tAccuracy of model " + model_to_run + ": ", acc)
print("\t\t\t\tAccuracy of model " + model_to_run + ": ", accuracy)

append_results(model_to_run + ": " + str(acc), results_data_file)
accuracies = accuracies.append(pd.DataFrame({"Model": [model_to_run], "Vectorizer": [vec], "Accuracy": [acc]}), ignore_index=True)
append_results(model_to_run + ": " + str(accuracy), results_data_file)
accuracies = accuracies.append(pd.DataFrame({"Model": [model_to_run], "Vectorizer": [vec], "Accuracy": [accuracy]}), ignore_index=True)

# save the accuracies of vocab for each model
results_model_accuracy_file = os.path.join(results_dir_path, "accuracies_" + vocabulary + ".png")
Expand Down

0 comments on commit 4fa897c

Please sign in to comment.