Skip to content

Commit

Permalink
Updates to handle the issue side of the prediction
Browse files Browse the repository at this point in the history
  • Loading branch information
Profir-Petru Partachi committed Dec 15, 2017
1 parent 1dc17a0 commit 474d628
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 50 deletions.
121 changes: 82 additions & 39 deletions Prediction/Linker.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,13 @@
from github import Github

from Prediction.feature_generation import generate_features
from Prediction.gitScraper import clone_git_repo_to_tmp, get_all_commit_hashes, process_a_commit
from Prediction.gitScraper import get_all_commit_hashes, process_a_commit
from Prediction.training_utils import train_classifier, generate_training_data, generate_dev_fingerprint, \
generate_tfidf, update, inflate_events, generate_batches, null_issue, flatten_events
generate_tfidf, update, inflate_events, generate_batches, null_issue, flatten_events, null_pr
from Util import utils_
from Util.ReservedKeywords import java_reserved, c_reserved, cpp_reserved, javascript_reserved, python_reserved
from Util.github_api_methods import parse_pr_ref, parse_issue_ref
from gitMine.VCClasses import IssueStates, Commit, Issue
from gitMine.VCClasses import IssueStates, Commit, Issue, PullRequest

stopwords = utils_.GitMineUtils.STOPWORDS \
+ list(set(java_reserved + c_reserved + cpp_reserved + javascript_reserved + python_reserved))
Expand Down Expand Up @@ -99,49 +99,92 @@ def fit(self, repository_obj, truth):
self.undersample_multiplicity, self.min_tok_len,
self.net_size_in_days))

def predict(self, prediction_pr):
    """Rank open or temporally-close issues as link candidates for a PR.

    Returns a tuple of (PR number, [(issue, probability), ...]) sorted by
    descending probability, keeping only candidates at or above
    ``self.prediction_threshold``.
    """
    # Only the opening comment (the PR description) carries signal here.
    prediction_pr.comments = prediction_pr.comments[:1]
    window = timedelta(days=self.net_size_in_days)

    def _is_candidate(issue):
        # An issue qualifies if it is open (or has no recorded state
        # transitions), or if any of its events happened within the
        # temporal window around the PR's opening comment.
        if len(issue.states) == 0 or issue.states[-1].to_ == IssueStates.open:
            return True
        gaps = [abs(entity.timestamp - prediction_pr.comments[0].timestamp)
                if entity.timestamp and prediction_pr.comments
                else window + timedelta(seconds=1)
                for entity in [issue.original_post] + issue.states + issue.actions]
        return min(gaps) <= window

    candidates = [i for i in self.repository_obj.issues if _is_candidate(i)]
    # The artificial null issue acts as an "abstain" candidate.
    candidates.append(null_issue)

    predictions = list()
    for issue_ in candidates:
        point = generate_features(issue_, prediction_pr, stopwords, self.fingerprint,
                                  self.dictionary, self.model, dict(), self.min_tok_len,
                                  self.net_size_in_days)
        feature_vector = np.array((point.engagement,
                                   point.cosine_tt,
                                   point.cosine,
                                   point.lag,
                                   point.lag_close,
                                   point.lag_open,
                                   point.pr_commits,)).reshape(1, -1)
        # Column 1 of predict_proba is the probability of the "linked" class.
        score = float(self.clf.predict_proba(feature_vector)[0][1])
        predictions.append((point.issue, score))

    kept = [p for p in predictions if p[1] >= self.prediction_threshold]
    return prediction_pr.number, sorted(kept, key=lambda p: (p[1], p[0]), reverse=True)
def predict(self, prediction_object):
    """Return ranked link suggestions for a PullRequest or an Issue.

    For a PullRequest, open or temporally-close issues are ranked; for an
    Issue, temporally-close PRs are ranked. Returns a tuple of
    (object identifier, [(candidate, probability), ...]) sorted by
    descending probability; implicitly returns None for other types.
    """
    window = timedelta(days=self.net_size_in_days)

    def _rank(points, is_null, candidate_of):
        # Score every candidate feature point. The artificial null
        # candidate is an "abstain" option: its score only raises the
        # acceptance threshold and is never reported as a suggestion.
        threshold = self.prediction_threshold
        scored = list()
        for point in points:
            feature_vector = np.array((point.engagement,
                                       point.cosine_tt,
                                       point.cosine,
                                       point.lag,
                                       point.lag_close,
                                       point.lag_open,
                                       point.pr_commits,)).reshape(1, -1)
            # Column 1 of predict_proba is the probability of the "linked" class.
            score = float(self.clf.predict_proba(feature_vector)[0][1])
            if is_null(point):
                threshold = max(threshold, score)
            else:
                scored.append((candidate_of(point), score))
        return sorted([p for p in scored if p[1] >= threshold],
                      key=lambda p: (p[1], p[0]),
                      reverse=True)

    if isinstance(prediction_object, PullRequest):
        # Only the opening comment (the PR description) carries signal here.
        prediction_object.comments = prediction_object.comments[:1]
        # Candidate issues: still open, or active within the temporal window.
        open_issues = [i for i in self.repository_obj.issues
                       if
                       (len(i.states) == 0 or i.states[-1].to_ == IssueStates.open)
                       or
                       (min([abs(entity.timestamp - prediction_object.comments[0].timestamp)
                             if entity.timestamp and prediction_object.comments
                             else window + timedelta(seconds=1)
                             for entity in
                             [i.original_post]
                             + i.states
                             + i.actions]) <= window)]
        open_issues += [null_issue]
        prediction_data = [generate_features(issue_, prediction_object, stopwords, self.fingerprint,
                                             self.dictionary, self.model, dict(), self.min_tok_len,
                                             self.net_size_in_days)
                           for issue_ in open_issues]
        # BUGFIX: in this branch the null sentinel is on the *issue* side
        # (null_issue was appended above), so the null check must inspect
        # point.issue. The previous code tested `point.pr == 'null_issue'`,
        # which could never match: the threshold was never raised and the
        # sentinel leaked into the returned suggestions. This mirrors the
        # Issue branch below, which checks `point.pr == 'null_pr'`.
        # NOTE(review): assumes generate_features exposes the issue id as a
        # string equal to 'null_issue' for the sentinel — confirm against
        # Prediction/feature_generation.
        predictions = _rank(prediction_data,
                            lambda point: point.issue == 'null_issue',
                            lambda point: point.issue)
        return prediction_object.number, predictions
    elif isinstance(prediction_object, Issue):
        # Candidate PRs: those with activity within the temporal window
        # around the issue's own events.
        candidates = [p for p in self.repository_obj.prs
                      if
                      (min([abs(entity.timestamp - p.comments[0].timestamp)
                            if entity.timestamp and p.comments
                            else window + timedelta(seconds=1)
                            for entity in
                            [prediction_object.original_post]
                            + prediction_object.states
                            + prediction_object.actions]) <= window)]
        candidates += [null_pr]
        prediction_data = [generate_features(prediction_object, pr_, stopwords, self.fingerprint,
                                             self.dictionary, self.model, dict(), self.min_tok_len,
                                             self.net_size_in_days)
                           for pr_ in candidates]
        predictions = _rank(prediction_data,
                            lambda point: point.pr == 'null_pr',
                            lambda point: point.pr)
        return prediction_object.id_, predictions

def update_and_predict(self, event):
if isinstance(event[1], Commit):
if event[0] not in self.repository_obj.commits:
self.repository_obj.commits.append(event[0])
elif isinstance(event[1], Issue):
prediction = self.predict(event[1])
update(event, self.repository_obj.issues)
return prediction
else:
prediction = self.predict(event[1])
update(event, self.repository_obj.prs)
Expand Down
19 changes: 9 additions & 10 deletions backend/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from Prediction.Linker import Linker
import os

from Util.github_api_methods import parse_pr_ref
from Util.github_api_methods import parse_pr_ref, parse_issue_ref

models = list()
linkers = dict()
Expand Down Expand Up @@ -59,15 +59,14 @@ class RequestHandler(SimpleXMLRPCRequestHandler):
# Register an instance; all the methods of the instance are
# published as XML-RPC methods
class PredictionFunctions:
# TODO: Create issue symmetric case
# def predict_issue(self, project, issue_id):
# try:
# issue_ref = projects[project].get_issue(issue_id)
# issue = parse_issue_ref(issue_ref)
#
# return suggestions
# except KeyError:
# return None
def predict_issue(self, project, issue_id):
    """Fetch an issue from GitHub and return link suggestions for it.

    Returns a list of suggestions, or None when the project is unknown
    (i.e. a KeyError is raised while resolving the project or its linker).
    """
    try:
        issue_ref = projects[project].get_issue(int(issue_id))
        issue = parse_issue_ref(issue_ref)
        # The (issue, issue) pair matches the event shape expected by
        # Linker.update_and_predict for the issue-side prediction.
        _, suggestions = linkers[project].update_and_predict((issue, issue))
        return list(suggestions)
    except KeyError:
        return None

def predict_pr(self, project, pr_id):
try:
Expand Down
6 changes: 5 additions & 1 deletion chrome_entry/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,18 @@ def read_thread_func():
continue
repo = msg['Repository'].translate({ord(c): '_' for c in '\\/'})
pr_id = msg['PR']
issue_id = msg['Issue']
out_msg = '{"Suggestions": [], "Error": "Received data, loading model."}'
send_message(out_msg)

try:
out_msg = '{"Suggestions": [], "Error": "Model loaded, running predictions."}'
send_message(out_msg)
local_server = xmlrpc.client.ServerProxy('http://localhost:8000')
suggestions = local_server.predict_pr(repo, pr_id)
if pr_id:
suggestions = local_server.predict_pr(repo, pr_id)
elif issue_id:
suggestions = local_server.predict_issue(repo, issue_id)
# with open('debug.txt', 'w') as f:
# f.write('Got suggestions: %s' % str(suggestions))
if len(suggestions) > 0:
Expand Down

0 comments on commit 474d628

Please sign in to comment.