Skip to content

Commit

Permalink
Updates to handle the issue side of the prediction
Browse files Browse the repository at this point in the history
  • Loading branch information
Profir-Petru Partachi committed Dec 15, 2017
1 parent 1dc17a0 commit 474d628
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 50 deletions.
121 changes: 82 additions & 39 deletions Prediction/Linker.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,13 @@
from github import Github

from Prediction.feature_generation import generate_features
from Prediction.gitScraper import clone_git_repo_to_tmp, get_all_commit_hashes, process_a_commit
from Prediction.gitScraper import get_all_commit_hashes, process_a_commit
from Prediction.training_utils import train_classifier, generate_training_data, generate_dev_fingerprint, \
generate_tfidf, update, inflate_events, generate_batches, null_issue, flatten_events
generate_tfidf, update, inflate_events, generate_batches, null_issue, flatten_events, null_pr
from Util import utils_
from Util.ReservedKeywords import java_reserved, c_reserved, cpp_reserved, javascript_reserved, python_reserved
from Util.github_api_methods import parse_pr_ref, parse_issue_ref
from gitMine.VCClasses import IssueStates, Commit, Issue
from gitMine.VCClasses import IssueStates, Commit, Issue, PullRequest

stopwords = utils_.GitMineUtils.STOPWORDS \
+ list(set(java_reserved + c_reserved + cpp_reserved + javascript_reserved + python_reserved))
Expand Down Expand Up @@ -99,49 +99,92 @@ def fit(self, repository_obj, truth):
self.undersample_multiplicity, self.min_tok_len,
self.net_size_in_days))

def predict(self, prediction_pr):
    """Rank open or temporally-close issues as link candidates for a PR.

    Returns a tuple of (PR number, [(issue, probability), ...]) sorted by
    descending probability, keeping only candidates at or above
    ``self.prediction_threshold``.
    """
    # Only the opening comment (the PR description) carries signal here.
    prediction_pr.comments = prediction_pr.comments[:1]
    window = timedelta(days=self.net_size_in_days)

    def _is_candidate(issue):
        # An issue qualifies if it is open (or has no recorded state
        # transitions), or if any of its events happened within the
        # temporal window around the PR's opening comment.
        if len(issue.states) == 0 or issue.states[-1].to_ == IssueStates.open:
            return True
        gaps = [abs(entity.timestamp - prediction_pr.comments[0].timestamp)
                if entity.timestamp and prediction_pr.comments
                else window + timedelta(seconds=1)
                for entity in [issue.original_post] + issue.states + issue.actions]
        return min(gaps) <= window

    candidates = [i for i in self.repository_obj.issues if _is_candidate(i)]
    # The artificial null issue acts as an "abstain" candidate.
    candidates.append(null_issue)

    predictions = list()
    for issue_ in candidates:
        point = generate_features(issue_, prediction_pr, stopwords, self.fingerprint,
                                  self.dictionary, self.model, dict(), self.min_tok_len,
                                  self.net_size_in_days)
        feature_vector = np.array((point.engagement,
                                   point.cosine_tt,
                                   point.cosine,
                                   point.lag,
                                   point.lag_close,
                                   point.lag_open,
                                   point.pr_commits,)).reshape(1, -1)
        # Column 1 of predict_proba is the probability of the "linked" class.
        score = float(self.clf.predict_proba(feature_vector)[0][1])
        predictions.append((point.issue, score))

    kept = [p for p in predictions if p[1] >= self.prediction_threshold]
    return prediction_pr.number, sorted(kept, key=lambda p: (p[1], p[0]), reverse=True)
def predict(self, prediction_object):
    """Return ranked link suggestions for a PullRequest or an Issue.

    For a PullRequest, open or temporally-close issues are ranked; for an
    Issue, temporally-close PRs are ranked. Returns a tuple of
    (object identifier, [(candidate, probability), ...]) sorted by
    descending probability; implicitly returns None for other types.
    """
    window = timedelta(days=self.net_size_in_days)

    def _rank(points, is_null, candidate_of):
        # Score every candidate feature point. The artificial null
        # candidate is an "abstain" option: its score only raises the
        # acceptance threshold and is never reported as a suggestion.
        threshold = self.prediction_threshold
        scored = list()
        for point in points:
            feature_vector = np.array((point.engagement,
                                       point.cosine_tt,
                                       point.cosine,
                                       point.lag,
                                       point.lag_close,
                                       point.lag_open,
                                       point.pr_commits,)).reshape(1, -1)
            # Column 1 of predict_proba is the probability of the "linked" class.
            score = float(self.clf.predict_proba(feature_vector)[0][1])
            if is_null(point):
                threshold = max(threshold, score)
            else:
                scored.append((candidate_of(point), score))
        return sorted([p for p in scored if p[1] >= threshold],
                      key=lambda p: (p[1], p[0]),
                      reverse=True)

    if isinstance(prediction_object, PullRequest):
        # Only the opening comment (the PR description) carries signal here.
        prediction_object.comments = prediction_object.comments[:1]
        # Candidate issues: still open, or active within the temporal window.
        open_issues = [i for i in self.repository_obj.issues
                       if
                       (len(i.states) == 0 or i.states[-1].to_ == IssueStates.open)
                       or
                       (min([abs(entity.timestamp - prediction_object.comments[0].timestamp)
                             if entity.timestamp and prediction_object.comments
                             else window + timedelta(seconds=1)
                             for entity in
                             [i.original_post]
                             + i.states
                             + i.actions]) <= window)]
        open_issues += [null_issue]
        prediction_data = [generate_features(issue_, prediction_object, stopwords, self.fingerprint,
                                             self.dictionary, self.model, dict(), self.min_tok_len,
                                             self.net_size_in_days)
                           for issue_ in open_issues]
        # BUGFIX: in this branch the null sentinel is on the *issue* side
        # (null_issue was appended above), so the null check must inspect
        # point.issue. The previous code tested `point.pr == 'null_issue'`,
        # which could never match: the threshold was never raised and the
        # sentinel leaked into the returned suggestions. This mirrors the
        # Issue branch below, which checks `point.pr == 'null_pr'`.
        # NOTE(review): assumes generate_features exposes the issue id as a
        # string equal to 'null_issue' for the sentinel — confirm against
        # Prediction/feature_generation.
        predictions = _rank(prediction_data,
                            lambda point: point.issue == 'null_issue',
                            lambda point: point.issue)
        return prediction_object.number, predictions
    elif isinstance(prediction_object, Issue):
        # Candidate PRs: those with activity within the temporal window
        # around the issue's own events.
        candidates = [p for p in self.repository_obj.prs
                      if
                      (min([abs(entity.timestamp - p.comments[0].timestamp)
                            if entity.timestamp and p.comments
                            else window + timedelta(seconds=1)
                            for entity in
                            [prediction_object.original_post]
                            + prediction_object.states
                            + prediction_object.actions]) <= window)]
        candidates += [null_pr]
        prediction_data = [generate_features(prediction_object, pr_, stopwords, self.fingerprint,
                                             self.dictionary, self.model, dict(), self.min_tok_len,
                                             self.net_size_in_days)
                           for pr_ in candidates]
        predictions = _rank(prediction_data,
                            lambda point: point.pr == 'null_pr',
                            lambda point: point.pr)
        return prediction_object.id_, predictions

def update_and_predict(self, event):
if isinstance(event[1], Commit):
if event[0] not in self.repository_obj.commits:
self.repository_obj.commits.append(event[0])
elif isinstance(event[1], Issue):
prediction = self.predict(event[1])
update(event, self.repository_obj.issues)
return prediction
else:
prediction = self.predict(event[1])
update(event, self.repository_obj.prs)
Expand Down
19 changes: 9 additions & 10 deletions backend/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from Prediction.Linker import Linker
import os

from Util.github_api_methods import parse_pr_ref
from Util.github_api_methods import parse_pr_ref, parse_issue_ref

models = list()
linkers = dict()
Expand Down Expand Up @@ -59,15 +59,14 @@ class RequestHandler(SimpleXMLRPCRequestHandler):
# Register an instance; all the methods of the instance are
# published as XML-RPC methods
class PredictionFunctions:
# TODO: Create issue symmetric case
# def predict_issue(self, project, issue_id):
# try:
# issue_ref = projects[project].get_issue(issue_id)
# issue = parse_issue_ref(issue_ref)
#
# return suggestions
# except KeyError:
# return None
def predict_issue(self, project, issue_id):
    """Fetch an issue from GitHub and return link suggestions for it.

    Returns a list of suggestions, or None when the project is unknown
    (i.e. a KeyError is raised while resolving the project or its linker).
    """
    try:
        issue_ref = projects[project].get_issue(int(issue_id))
        issue = parse_issue_ref(issue_ref)
        # The (issue, issue) pair matches the event shape expected by
        # Linker.update_and_predict for the issue-side prediction.
        _, suggestions = linkers[project].update_and_predict((issue, issue))
        return list(suggestions)
    except KeyError:
        return None

def predict_pr(self, project, pr_id):
try:
Expand Down
6 changes: 5 additions & 1 deletion chrome_entry/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,18 @@ def read_thread_func():
continue
repo = msg['Repository'].translate({ord(c): '_' for c in '\\/'})
pr_id = msg['PR']
issue_id = msg['Issue']
out_msg = '{"Suggestions": [], "Error": "Received data, loading model."}'
send_message(out_msg)

try:
out_msg = '{"Suggestions": [], "Error": "Model loaded, running predictions."}'
send_message(out_msg)
local_server = xmlrpc.client.ServerProxy('http://localhost:8000')
suggestions = local_server.predict_pr(repo, pr_id)
if pr_id:
suggestions = local_server.predict_pr(repo, pr_id)
elif issue_id:
suggestions = local_server.predict_issue(repo, issue_id)
# with open('debug.txt', 'w') as f:
# f.write('Got suggestions: %s' % str(suggestions))
if len(suggestions) > 0:
Expand Down

0 comments on commit 474d628

Please sign in to comment.