Merge pull request #34 from vaquierm/marine/BNB_Issue1

hmarine · web-flow · commit 4fa897c73e11 · 2019-10-15T22:52:22.000-04:00
Implementation of Bernoulli naive Bayes
diff --git a/src/config.py b/src/config.py
@@ -13,7 +13,7 @@
 # These are all the different vectorizers to run ("BINARY", "TFIDF")
 vectorizers_to_run = ["TFIDF"]
 
-# These are all the models to run and compare performance on a k fold cross validation ("LR", "NB", "MNNB", "KNN", "DT", "RF", "SVM", "SUPER")
+# These are all the models to run and compare performance on a k fold cross validation ("LR", "NB", "NB_SKLEARN", "MNNB", "KNN", "DT", "RF", "SVM", "SUPER")
 models_to_run = ["MNNB"]
 
 # If this is true, run gridsearch on each model (This will significantly increase the runtime of the validation pipeline for model types that support gridsearch)
diff --git a/src/models/NaiveBayes.py b/src/models/NaiveBayes.py
@@ -1,8 +1,18 @@
+import numpy as np
+
+from sklearn.preprocessing import LabelBinarizer
+from sklearn.utils.extmath import safe_sparse_dot
 from src.models.Model import Model
 
 
 class NaiveBayes(Model):
 
+    def __init__(self, alpha: float = 1):
+        if alpha < 0:
+            raise Exception("Alpha must be greater than zero")
+
+        self.alpha = alpha
+
     def fit(self, X, Y):
         """
         Fit the model with the training data
@@ -11,14 +21,61 @@ def fit(self, X, Y):
         :return: None
         """
         super().fit(X, Y)
-        # TODO: Marine Implement this
         # https://github.com/vaquierm/RedditCommentTextClassification/issues/1
 
+        subreddits = np.unique(Y)
+
+        # fit the model
+        self.parameters = {}
+        total_per_class = []
+        thetak = []  # parameter theta_k = number of comments of class k / total number of comments
+        alpha = 1
+
+        # compute theta k
+        # for each class
+        for i in range(len(subreddits)):
+            feature = subreddits[i]
+            numbExamples = 0
+
+            # loop through all the comments
+            for j in range(len(Y)):
+                if (Y[j] == feature):
+                    numbExamples += 1
+
+            total_per_class.append(float(numbExamples))
+            thetak_i = float(numbExamples) / float(X.shape[0])
+            thetak.append(thetak_i)
+
+        binarizer = LabelBinarizer()
+        Y = binarizer.fit_transform(Y)
+
+        # parameter theta_kj, computed using sparse matrix products
+        # add alpha for Laplace smoothing (NOTE: this is the local alpha = 1 set above, not self.alpha)
+        kj_numerator = safe_sparse_dot(Y.T, X) + alpha
+        # kj_denominator == # of comments from that class
+        total_per_class = np.array(total_per_class)
+
+        # add 2 * alpha for Laplace smoothing (one pseudo-count per Bernoulli outcome)
+        kj_denominator = total_per_class.reshape(-1, 1) + 2*alpha
+
+        log_thetakj = np.log(kj_numerator) - np.log(kj_denominator)
+
+        self.parameters.update({'parameter_k': thetak})
+        self.parameters.update({'parameter_log_kj': log_thetakj})
+
     def predict(self, X):
         """
         Predict the labels based on the inputs
         :param X: Inputs
         :return: The predicted labels based on the training
         """
         super().predict(X)
-        # TODO: Marine Implement this
+        log_one_minus_thatakj = np.log(1 - np.exp(self.parameters["parameter_log_kj"]))
+        first_summation = self.parameters["parameter_log_kj"] - log_one_minus_thatakj
+
+        first_term = np.log(self.parameters["parameter_k"])
+        second_term = safe_sparse_dot(X, first_summation.T)
+        third_term = log_one_minus_thatakj.sum(axis=1)
+        prediction = first_term + second_term + third_term
+
+        return np.argmax(prediction, axis=1)
diff --git a/src/utils/factory.py b/src/utils/factory.py
@@ -8,7 +8,7 @@
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.neighbors import KNeighborsClassifier
-from sklearn.naive_bayes import MultinomialNB
+from sklearn.naive_bayes import MultinomialNB, BernoulliNB
 
 
 def get_vectorizer(vectorizer_name):
@@ -33,6 +33,8 @@ def get_model(model_name: str, grid_search: bool = False):
             return GridSearchCV(LogisticRegression(multi_class='auto'), param_grid, cv=5)
     elif model_name == "NB":
         return NaiveBayes()
+    elif model_name == "NB_SKLEARN":
+        return BernoulliNB()
     elif model_name == "MNNB":
         if not grid_search:
             return MultinomialNB(alpha=0.0001)
diff --git a/src/validation_pipeline.py b/src/validation_pipeline.py
@@ -62,14 +62,14 @@ def run_validation_pipeline(linear_correlation: bool = True):
 
                     print("\t\t\tThe best parameters for model: " + model_to_run + " are ", model.best_params_)
                     print("\t\t\tRunning k fold validation with the best model")
-                    acc, conf_mat = k_fold_validation(model.best_estimator_, X_trains, X_tests, Y_trains, Y_tests, linear_correlation)
+                    accuracy, conf_mat = k_fold_validation(model.best_estimator_, X_trains, X_tests, Y_trains, Y_tests, linear_correlation)
 
                 results_confusion_matrix_file = os.path.join(results_dir_path, vocabulary + "_"+ vec + "_" + model_to_run + "_" + "confusion.png")
                 save_confusion_matrix(conf_mat, "Confusion Matrix for vocabulary " + vocabulary + ", vectorizer " + vec + "and model " + model_to_run, list(map(lambda pred: int_to_subreddit[pred], unique_labels(Y))), results_confusion_matrix_file)
-                print("\t\t\t\tAccuracy of model " + model_to_run + ": ", acc)
+                print("\t\t\t\tAccuracy of model " + model_to_run + ": ", accuracy)
 
-                append_results(model_to_run + ": " + str(acc), results_data_file)
-                accuracies = accuracies.append(pd.DataFrame({"Model": [model_to_run], "Vectorizer": [vec], "Accuracy": [acc]}), ignore_index=True)
+                append_results(model_to_run + ": " + str(accuracy), results_data_file)
+                accuracies = accuracies.append(pd.DataFrame({"Model": [model_to_run], "Vectorizer": [vec], "Accuracy": [accuracy]}), ignore_index=True)
 
         # save the accuracies of vocab for each model
         results_model_accuracy_file = os.path.join(results_dir_path, "accuracies_" + vocabulary + ".png")