Add option for more granular features
Use UNKs and record UNKs in simulation
Make predictions only at salient events
Profir-Petru Partachi committed Apr 26, 2018
1 parent 515bb8f commit 867ea4e
Showing 6 changed files with 110 additions and 68 deletions.
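In short: fit now accepts an optional whitelist of feature names, out-of-vocabulary tokens are folded into an explicit UNK term whose rate is recorded during simulation, and predictions are made only at salient events (first sighting of an issue or PR, or a closing/merging state change). A usage sketch assembled from the driver code further down in this diff (config, stopwords, training, truth, repo, and batches are built as shown there):

features = ['cosine_tc', 'report_size', 'branch_size', 'files_touched_by_pr', 'developer_normalised_lag']
linker = Linker(net_size_in_days=14, min_tok_len=3, undersample_multiplicity=1000,
                stopwords=stopwords, feature_config=config, predictions_between_updates=1000)
linker.fit(inflate_events(training, repo.langs, repo.name), truth, features=features)
scores, unk_rate = linker.validate_over_suffix(batches[-1])  # per-event predictions plus UNK rates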
84 changes: 55 additions & 29 deletions Prediction/Linker.py
@@ -20,7 +20,7 @@
from Util.ReservedKeywords import java_reserved, c_reserved, cpp_reserved, javascript_reserved, python_reserved
from Util.github_api_methods import parse_pr_ref, parse_issue_ref
from Util.heuristic_methods import extract_issue_numbers
from gitMine.VCClasses import IssueStates, Commit, Issue, PullRequest
from gitMine.VCClasses import IssueStates, Commit, Issue, PullRequest, StateChange


def evaluate_at_threshold(result, th, truth):
@@ -127,6 +127,7 @@ def __init__(self, net_size_in_days, undersample_multiplicity, feature_config, p
self.model = None
self.dictionary = None
self.feature_generator = None
self.features = None
self.prediction_threshold = 1e-5
self.predictions_from_last_tf_idf_update = 0
self.predictions_between_updates = predictions_between_updates
@@ -147,9 +148,10 @@ def __init__(self, net_size_in_days, undersample_multiplicity, feature_config, p
assert self.min_tok_len is not None
assert self.stopwords is not None

def fit(self, repository_obj, truth):
def fit(self, repository_obj, truth, features=None):
self.repository_obj = repository_obj
self.truth = truth
self.features = features

similarity_config = None
temporal_config = None
@@ -181,6 +183,7 @@ def fit(self, repository_obj, truth):
similarity_config=similarity_config,
temporal_config=temporal_config,
text_cache=cache,
selected=self.features,
)
self.clf = train_classifier(generate_training_data(self.repository_obj,
self.feature_generator,
@@ -298,7 +301,8 @@ def predict(self, prediction_object):
use_issue_only=self.use_issue_only,
similarity_config=similarity_config,
temporal_config=temporal_config,
text_cache=new_cache
text_cache=new_cache,
selected=self.features,
)
return response

@@ -319,11 +323,17 @@ def update_from_flat_repo_and_predict(self, event):
if event[0] not in self.repository_obj.commits:
self.repository_obj.commits.append(event[0])
elif isinstance(event[1], Issue):
prediction = self.predict(event[1])
prediction = None
if len([i for i in self.repository_obj.issues if i.id_ == event[1].id_]) == 0 or \
(isinstance(event[0], StateChange) and event[0].to_ == IssueStates.closed):
prediction = self.predict(event[1])
update(event, self.repository_obj.issues)
return prediction
else:
prediction = self.predict(event[1])
prediction = None
if len([p for p in self.repository_obj.prs if p.id_ == event[1].id_]) == 0 or \
(isinstance(event[0], StateChange) and event[0].to_ == IssueStates.merged):
prediction = self.predict(event[1])
update(event, self.repository_obj.prs)
return prediction

@@ -342,28 +352,35 @@ def request_prediction(self, issue_or_pr):

def validate_over_suffix(self, suffix):
scores = list()
unk_rate = list()
for event in suffix:
result = self.update_from_flat_repo_and_predict(event)
if result:
scores.append(result)
id_, predictions = result
predictions = [t[0][len('issue_'):] for t in predictions[:5]]
id_ = id_[len('issue_'):]
UNKs = self.feature_generator.get_tf(self.feature_generator.via_text_cache(id_, event[1]))[-1][-1]
unk_rate.append(UNKs)
if isinstance(event[1], Issue):
for other in predictions:
try:
if ('#' + other) in self.truth['#' + id_]:
self.update_truth((id_, other))
else:
self.update_truth((id_, other), is_true=False)
except KeyError:
pass
self.update_truth((id_, other), is_true=False)
elif isinstance(event[1], PullRequest):
for other in predictions:
try:
if ('#' + id_) in self.truth['#' + other]:
self.update_truth((other, id_))
else:
self.update_truth((other, id_), is_true=False)
except KeyError:
pass
return scores
self.update_truth((other, id_), is_true=False)
return scores, unk_rate

def update_from_github(self, gh, since):
"""
@@ -385,6 +402,7 @@ def update_from_github(self, gh, since):
if pr_ref.number in pr_numbers:
old_pr = [pr for pr in self.repository_obj.prs if pr.number == pr_ref.number][0]
self.repository_obj.prs.remove(old_pr)
self.repository_obj.prs.append(pr)
try:
all_text = '\n'.join([c.body for c in pr.comments] + [c.title + c.desc for c in pr.commits])
issue_numbers = extract_issue_numbers(all_text)
@@ -393,7 +411,6 @@
self.update_truth((issue_id[1:], pr.number[len('issue_'):]))
except TypeError:
pass
self.repository_obj.prs.append(pr)

issue_refs = [ref for ref in repo.get_issues(state='all', since=since)]
for issue_ref in issue_refs:
@@ -405,6 +422,7 @@
if issue.id_ in issue_ids:
existing_issue = [i for i in self.repository_obj.issues if i.id_ == issue.id_][0]
self.repository_obj.issues.remove(existing_issue)
self.repository_obj.issues.append(issue)
try:
all_text = '\n'.join([c.body for c in pr.comments] + [c.title + c.desc for c in pr.commits])
issue_numbers = extract_issue_numbers(all_text)
@@ -413,7 +431,6 @@
self.update_truth((issue.id_[len('issue_'):], pr_id))
except TypeError:
pass
self.repository_obj.issues.append(issue)

def update_from_local_git(self, git_location, since_sha):
"""
@@ -437,21 +454,26 @@ def update_from_local_git(self, git_location, since_sha):
if len([_ for _ in self.repository_obj.commits if _.c_hash.startswith(commit.c_hash)]) > 0:
self.repository_obj.commits.append(commit)

def update_truth(self, link):
def update_truth(self, link, is_true=True):
"""
Update the inner representation with a new link
:param link: a link tuple with the issue id first and the PR number second
:param is_true: whether the link is a true link; if False, only a model update is performed
"""
if is_true:
try:
if ('#' + link[1]) not in self.truth['#' + link[0]]:
self.truth['#' + link[0]].append('#' + link[1])
except KeyError:
self.truth['#' + link[0]] = ['#' + link[1]]
try:
if ('#' + link[1]) not in self.truth['#' + link[0]]:
self.truth['#' + link[0]].append('#' + link[1])
except KeyError:
self.truth['#' + link[0]] = ['#' + link[1]]
point = self.feature_generator.generate_features(
[i for i in self.repository_obj.issues if i.id_[len('issue_'):] == link[0]][0],
[p for p in self.repository_obj.prs if p.number[len('issue_'):] == link[1]][0],
linked=True)
self.clf = self.clf.partial_fit([tuple([v for k, v in point.items() if k not in ['linked', 'issue', 'pr']])], [1])
point = self.feature_generator.generate_features(
[i for i in self.repository_obj.issues if i.id_[len('issue_'):] == link[0]][0],
[p for p in self.repository_obj.prs if p.number[len('issue_'):] == link[1]][0],
linked=True)
self.clf = self.clf.partial_fit([tuple([v for k, v in point.items() if k not in ['linked', 'issue', 'pr']])], [1])
except IndexError:
pass

def trim_truth(self):
"""
@@ -625,16 +647,16 @@ def load_from_disk(path):
projects = [
'PhilJay_MPAndroidChart',
# 'ReactiveX_RxJava',
# 'palantir_plottable',
'palantir_plottable',
# 'tensorflow_tensorflow',
]
config = {
'use_issue_only': False,
'use_issue_only': True,
'use_pr_only': True,
'use_temporal': True,
'use_sim_cs': False,
'use_sim_j': True,
'use_sim_d': True,
'use_sim_cs': True,
'use_sim_j': False,
'use_sim_d': False,
'use_file': True,
'use_social': True
}
@@ -650,6 +672,7 @@ def load_from_disk(path):
# ('report_size participants bounces existing_links ' if config['use_issue_only'] else '')
# features_string = features_string.strip().split(' ')
# index_feature_map = {i: features_string[i] for i in range(len(features_string))}
features = ['cosine_tc', 'report_size', 'branch_size', 'files_touched_by_pr', 'developer_normalised_lag']
stopwords = utils_.GitMineUtils.STOPWORDS \
+ list(set(java_reserved + c_reserved + cpp_reserved + javascript_reserved + python_reserved))
for project in projects:
@@ -663,18 +686,18 @@

batches = generate_batches(repo, n_batches)
for i in [n_batches - 1]:
linker = Linker(net_size_in_days=14, min_tok_len=3, undersample_multiplicity=1, stopwords=stopwords,
linker = Linker(net_size_in_days=14, min_tok_len=3, undersample_multiplicity=1000, stopwords=stopwords,
feature_config=config, predictions_between_updates=1000)
training = list()
for j in range(n_batches - 1):
training += batches[j]
linker.fit(inflate_events(training, repo.langs, repo.name), truth)
linker.fit(inflate_events(training, repo.langs, repo.name), truth, features=features)
# forest = linker.clf
# importances = forest.feature_importances_
# std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
# pd.DataFrame(data={'Feature': features_string, 'Importance': importances, 'STD': std}) \
# .to_csv(project_dir[:-5] + ('_results_f%d_NullExplicit_UNKExplicit_FullFeatures_IMP.csv' % i))
scores = linker.validate_over_suffix(batches[i])
scores, unk_rate = linker.validate_over_suffix(batches[i])
scores_dict = dict()
for pr_id, predictions in scores:
try:
@@ -685,5 +708,8 @@
scores_dict[pr_id] = list(scores_dict[pr_id])
scores_dict[pr_id] = sorted(scores_dict[pr_id], reverse=True, key=lambda p: (p[1], p[0]))

with open(project_dir[:-5] + ('_results_f%d_NullExplicit_UNKExplicit_FullFeatures.txt' % i), 'w') as f:
with open(project_dir[:-5] + ('_results_f%d_selected_features_MF.txt' % i), 'w') as f:
f.write(str(scores_dict))

with open(project_dir[:-5] + ('_unk_rate_f%d_selected_features_MF.txt' % i), 'w') as f:
f.write(str(unk_rate))
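The gating logic added to update_from_flat_repo_and_predict above can be summarised in a small helper. This is a sketch, not code from the commit (is_salient_event is a hypothetical name), but it mirrors the two conditions used for issues and PRs:

from gitMine.VCClasses import IssueStates, StateChange

def is_salient_event(event, known_artifacts, closing_state):
    # event is a (trigger, artifact) pair; predict only on first sighting,
    # or when the trigger is a state change into `closing_state`
    # (IssueStates.closed for issues, IssueStates.merged for PRs).
    first_sighting = all(a.id_ != event[1].id_ for a in known_artifacts)
    closes = isinstance(event[0], StateChange) and event[0].to_ == closing_state
    return first_sighting or closes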
19 changes: 14 additions & 5 deletions Prediction/feature_generation.py
@@ -60,7 +60,7 @@ class FeatureGenerator(object):
Class that provides easy access to feature generation under a particular configuration
"""
def __init__(self, use_sim_cs, use_sim_j, use_social, use_temporal, use_file, use_pr_only, use_issue_only,
use_sim_d, similarity_config=None, temporal_config=None, text_cache=None):
use_sim_d, similarity_config=None, temporal_config=None, text_cache=None, selected=None):
self.use_sim_cs = use_sim_cs
self.use_sim_j = use_sim_j
self.use_sim_d = use_sim_d
@@ -80,6 +80,7 @@ def __init__(self, use_sim_cs, use_sim_j, use_social, use_temporal, use_file, use_pr_only, use_issue_only,
self.use_pr_only = use_pr_only
self.use_issue_only = use_issue_only
self.text_cache = dict() if text_cache is None else text_cache
self.selected = selected if selected is not None else list()

def via_text_cache(self, key, obj, full=True):
text_preprocessor = text_pipeline if full else preprocess_text
@@ -90,6 +91,11 @@ def via_text_cache(self, key, obj, full=True):
text = self.text_cache[key]
return text

def get_tf(self, text):
tf = self.dictionary.doc2bow(text, return_missing=True)
tf = tf[0] + [(-1, sum(tf[1].values()))]
return tf

def generate_features(self, issue_: Issue, pr: PullRequest, linked: bool) -> Dict[str, Any]:
issue_id = issue_.id_
pr_id = pr.number
@@ -116,19 +122,20 @@ def generate_features(self, issue_: Issue, pr: PullRequest, linked: bool) -> Dict[str, Any]:
cosine = cosine_similarity(pr_vector, i_vector)

pr_title_vector = np.zeros((len(self.dictionary.token2id) + 1,))
for index, value in self.model[self.dictionary.doc2bow(pr_title_text)]:

for index, value in self.model[self.get_tf(full_pr_text)]:
pr_title_vector[index] += value
pr_comment_vector = np.zeros((len(self.dictionary.token2id) + 1,))
try:
for index, value in self.model[self.dictionary.doc2bow(pr_desc_text)]:
for index, value in self.model[self.get_tf(pr_desc_text)]:
pr_comment_vector[index] += value
except IndexError:
pass
i_title_vector = np.zeros((len(self.dictionary.token2id) + 1,))
for index, value in self.model[self.dictionary.doc2bow(issue_title_text)]:
for index, value in self.model[self.get_tf(issue_title_text)]:
i_title_vector[index] += value
i_comment_vector = np.zeros((len(self.dictionary.token2id) + 1,))
for index, value in self.model[self.dictionary.doc2bow(issue_report_text)]:
for index, value in self.model[self.get_tf(issue_report_text)]:
i_comment_vector[index] += value

cosine_tt = cosine_similarity(pr_title_vector, i_title_vector)
@@ -254,4 +261,6 @@ def generate_features(self, issue_: Issue, pr: PullRequest, linked: bool) -> Dict[str, Any]:
features['bounces'] = len([s.to_ == IssueStates.open for s in issue_.states])
features['existing_links'] = len(issue_.commits)

if self.selected:
features = {k: v for k, v in features.items() if k in ['pr', 'issue', 'linked'] + self.selected}
return features
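The new get_tf builds on gensim's Dictionary.doc2bow, which with return_missing=True returns the bag-of-words plus a dict of out-of-vocabulary tokens and their counts. A minimal standalone sketch (toy corpus, illustrative ids):

from gensim.corpora import Dictionary

dictionary = Dictionary([['null', 'pointer', 'exception'], ['merge', 'branch']])

def get_tf(text):
    # Fold all out-of-vocabulary mass into a single reserved UNK id of -1.
    bow, missing = dictionary.doc2bow(text, return_missing=True)
    return bow + [(-1, sum(missing.values()))]

print(get_tf(['null', 'pointer', 'dereference']))  # e.g. [(1, 1), (2, 1), (-1, 1)]

Indexing with -1 wraps to the last position of a numpy array, which is presumably why the feature vectors above are allocated with len(self.dictionary.token2id) + 1 slots: the extra cell holds the UNK weight.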
4 changes: 2 additions & 2 deletions Prediction/training_utils.py
@@ -286,10 +286,10 @@ def train_classifier(training_data_: List[Dict[str, Any]], perform_feature_selec
clf_ = Pipeline([
('feature_selection', SelectFromModel(RFE(
RandomForestClassifier(n_estimators=128, class_weight='balanced_subsample'), 5, step=1))),
('classification', MondrianForestClassifier(n_estimators=128, bootstrap=True))
('classification', MondrianForestClassifier(n_estimators=50,))
])
else:
clf_ = MondrianForestClassifier(n_estimators=128, bootstrap=True)
clf_ = MondrianForestClassifier(n_estimators=50,)
clf_.partial_fit(X, y)
return clf_

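For context, MondrianForestClassifier comes from scikit-garden and supports genuine incremental learning, which is what lets update_truth in Linker.py call partial_fit on individual new links. A toy sketch with synthetic data (not from the project):

import numpy as np
from skgarden import MondrianForestClassifier

rng = np.random.RandomState(0)
X1, y1 = rng.rand(64, 5), rng.randint(0, 2, 64)
X2, y2 = rng.rand(64, 5), rng.randint(0, 2, 64)

clf = MondrianForestClassifier(n_estimators=50)
clf.partial_fit(X1, y1)  # first batch grows the trees
clf.partial_fit(X2, y2)  # later batches extend the same trees online
print(clf.predict_proba(X2[:3]))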
6 changes: 3 additions & 3 deletions Util/interpret_validation_results.py
@@ -68,7 +68,7 @@ def evaluate_at_threshold(result, th, top_k, truth):

if __name__ == '__main__':
location_format = '../data/dev_set/%s.json'
n_fold = 1
n_fold = 5
projects = [
'PhilJay_MPAndroidChart',
# 'ReactiveX_RxJava',
@@ -79,8 +79,8 @@ def evaluate_at_threshold(result, th, top_k, truth):
]
for project in projects:
results = list()
for fold in range(n_fold):
with open((location_format[:-5] + '_results_f%d_NullExplicit_UNKExplicit_FullFeatures.txt') % (project, fold)) as f:
for fold in [n_fold - 1]:
with open((location_format[:-5] + '_results_f%d_selected_features_MF.txt') % (project, fold)) as f:
result_str = f.read()
result = ast.literal_eval(result_str)
results.append((fold, result))
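The results file is a str()-dumped dict, so it can be read back with ast.literal_eval as above; a minimal sketch (the path is illustrative, following the naming scheme in this diff):

import ast

with open('../data/dev_set/PhilJay_MPAndroidChart_results_f4_selected_features_MF.txt') as f:
    scores_dict = ast.literal_eval(f.read())
# maps an id to its candidate links sorted by (probability, id), e.g.
# {'1234': [('5678', 0.83), ('91011', 0.12)]}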
