Add option for more granular features
Use UNKs and record UNKs in simulation
Make predictions only at salient events
Profir-Petru Partachi committed Apr 26, 2018
1 parent 515bb8f commit 867ea4e
Showing 6 changed files with 110 additions and 68 deletions.
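In short: fit now accepts an optional whitelist of feature names, out-of-vocabulary tokens are folded into an explicit UNK term whose rate is recorded during simulation, and predictions are made only at salient events (first sighting of an issue or PR, or a closing/merging state change). A usage sketch assembled from the driver code further down in this diff (config, stopwords, training, truth, repo, and batches are built as shown there):

features = ['cosine_tc', 'report_size', 'branch_size', 'files_touched_by_pr', 'developer_normalised_lag']
linker = Linker(net_size_in_days=14, min_tok_len=3, undersample_multiplicity=1000,
                stopwords=stopwords, feature_config=config, predictions_between_updates=1000)
linker.fit(inflate_events(training, repo.langs, repo.name), truth, features=features)
scores, unk_rate = linker.validate_over_suffix(batches[-1])  # per-event predictions plus UNK rates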
84 changes: 55 additions & 29 deletions Prediction/Linker.py
@@ -20,7 +20,7 @@
from Util.ReservedKeywords import java_reserved, c_reserved, cpp_reserved, javascript_reserved, python_reserved
from Util.github_api_methods import parse_pr_ref, parse_issue_ref
from Util.heuristic_methods import extract_issue_numbers
from gitMine.VCClasses import IssueStates, Commit, Issue, PullRequest
from gitMine.VCClasses import IssueStates, Commit, Issue, PullRequest, StateChange


def evaluate_at_threshold(result, th, truth):
@@ -127,6 +127,7 @@ def __init__(self, net_size_in_days, undersample_multiplicity, feature_config, p
self.model = None
self.dictionary = None
self.feature_generator = None
self.features = None
self.prediction_threshold = 1e-5
self.predictions_from_last_tf_idf_update = 0
self.predictions_between_updates = predictions_between_updates
@@ -147,9 +148,10 @@ def __init__(self, net_size_in_days, undersample_multiplicity, feature_config, p
assert self.min_tok_len is not None
assert self.stopwords is not None

def fit(self, repository_obj, truth):
def fit(self, repository_obj, truth, features=None):
self.repository_obj = repository_obj
self.truth = truth
self.features = features

similarity_config = None
temporal_config = None
@@ -181,6 +183,7 @@ def fit(self, repository_obj, truth):
similarity_config=similarity_config,
temporal_config=temporal_config,
text_cache=cache,
selected=self.features,
)
self.clf = train_classifier(generate_training_data(self.repository_obj,
self.feature_generator,
@@ -298,7 +301,8 @@ def predict(self, prediction_object):
use_issue_only=self.use_issue_only,
similarity_config=similarity_config,
temporal_config=temporal_config,
text_cache=new_cache
text_cache=new_cache,
selected=self.features,
)
return response

@@ -319,11 +323,17 @@ def update_from_flat_repo_and_predict(self, event):
if event[0] not in self.repository_obj.commits:
self.repository_obj.commits.append(event[0])
elif isinstance(event[1], Issue):
prediction = self.predict(event[1])
prediction = None
if len([i for i in self.repository_obj.issues if i.id_ == event[1].id_]) == 0 or \
(isinstance(event[0], StateChange) and event[0].to_ == IssueStates.closed):
prediction = self.predict(event[1])
update(event, self.repository_obj.issues)
return prediction
else:
prediction = self.predict(event[1])
prediction = None
if len([p for p in self.repository_obj.prs if p.id_ == event[1].id_]) == 0 or \
(isinstance(event[0], StateChange) and event[0].to_ == IssueStates.merged):
prediction = self.predict(event[1])
update(event, self.repository_obj.prs)
return prediction

@@ -342,28 +352,35 @@ def request_prediction(self, issue_or_pr):

def validate_over_suffix(self, suffix):
scores = list()
unk_rate = list()
for event in suffix:
result = self.update_from_flat_repo_and_predict(event)
if result:
scores.append(result)
id_, predictions = result
predictions = [t[0][len('issue_'):] for t in predictions[:5]]
id_ = id_[len('issue_'):]
UNKs = self.feature_generator.get_tf(self.feature_generator.via_text_cache(id_, event[1]))[-1][-1]
unk_rate.append(UNKs)
if isinstance(event[1], Issue):
for other in predictions:
try:
if ('#' + other) in self.truth['#' + id_]:
self.update_truth((id_, other))
else:
self.update_truth((id_, other), is_true=False)
except KeyError:
pass
self.update_truth((id_, other), is_true=False)
elif isinstance(event[1], PullRequest):
for other in predictions:
try:
if ('#' + id_) in self.truth['#' + other]:
self.update_truth((other, id_))
else:
self.update_truth((other, id_), is_true=False)
except KeyError:
pass
return scores
self.update_truth((other, id_), is_true=False)
return scores, unk_rate

def update_from_github(self, gh, since):
"""
@@ -385,6 +402,7 @@ def update_from_github(self, gh, since):
if pr_ref.number in pr_numbers:
old_pr = [pr for pr in self.repository_obj.prs if pr.number == pr_ref.number][0]
self.repository_obj.prs.remove(old_pr)
self.repository_obj.prs.append(pr)
try:
all_text = '\n'.join([c.body for c in pr.comments] + [c.title + c.desc for c in pr.commits])
issue_numbers = extract_issue_numbers(all_text)
@@ -393,7 +411,6 @@
self.update_truth((issue_id[1:], pr.number[len('issue_'):]))
except TypeError:
pass
self.repository_obj.prs.append(pr)

issue_refs = [ref for ref in repo.get_issues(state='all', since=since)]
for issue_ref in issue_refs:
@@ -405,6 +422,7 @@
if issue.id_ in issue_ids:
existing_issue = [i for i in self.repository_obj.issues if i.id_ == issue.id_][0]
self.repository_obj.issues.remove(existing_issue)
self.repository_obj.issues.append(issue)
try:
all_text = '\n'.join([c.body for c in pr.comments] + [c.title + c.desc for c in pr.commits])
issue_numbers = extract_issue_numbers(all_text)
@@ -413,7 +431,6 @@
self.update_truth((issue.id_[len('issue_'):], pr_id))
except TypeError:
pass
self.repository_obj.issues.append(issue)

def update_from_local_git(self, git_location, since_sha):
"""
@@ -437,21 +454,26 @@ def update_from_local_git(self, git_location, since_sha):
if len([_ for _ in self.repository_obj.commits if _.c_hash.startswith(commit.c_hash)]) > 0:
self.repository_obj.commits.append(commit)

def update_truth(self, link):
def update_truth(self, link, is_true=True):
"""
Update the inner representation with a new link
:param link: a link tuple with the issue id first and the PR number second
:param is_true: whether the link is a true link; if False, only a model update is performed
"""
if is_true:
try:
if ('#' + link[1]) not in self.truth['#' + link[0]]:
self.truth['#' + link[0]].append('#' + link[1])
except KeyError:
self.truth['#' + link[0]] = ['#' + link[1]]
try:
if ('#' + link[1]) not in self.truth['#' + link[0]]:
self.truth['#' + link[0]].append('#' + link[1])
except KeyError:
self.truth['#' + link[0]] = ['#' + link[1]]
point = self.feature_generator.generate_features(
[i for i in self.repository_obj.issues if i.id_[len('issue_'):] == link[0]][0],
[p for p in self.repository_obj.prs if p.number[len('issue_'):] == link[1]][0],
linked=True)
self.clf = self.clf.partial_fit([tuple([v for k, v in point.items() if k not in ['linked', 'issue', 'pr']])], [1])
point = self.feature_generator.generate_features(
[i for i in self.repository_obj.issues if i.id_[len('issue_'):] == link[0]][0],
[p for p in self.repository_obj.prs if p.number[len('issue_'):] == link[1]][0],
linked=True)
self.clf = self.clf.partial_fit([tuple([v for k, v in point.items() if k not in ['linked', 'issue', 'pr']])], [1])
except IndexError:
pass

def trim_truth(self):
"""
@@ -625,16 +647,16 @@ def load_from_disk(path):
projects = [
'PhilJay_MPAndroidChart',
# 'ReactiveX_RxJava',
# 'palantir_plottable',
'palantir_plottable',
# 'tensorflow_tensorflow',
]
config = {
'use_issue_only': False,
'use_issue_only': True,
'use_pr_only': True,
'use_temporal': True,
'use_sim_cs': False,
'use_sim_j': True,
'use_sim_d': True,
'use_sim_cs': True,
'use_sim_j': False,
'use_sim_d': False,
'use_file': True,
'use_social': True
}
@@ -650,6 +672,7 @@ def load_from_disk(path):
# ('report_size participants bounces existing_links ' if config['use_issue_only'] else '')
# features_string = features_string.strip().split(' ')
# index_feature_map = {i: features_string[i] for i in range(len(features_string))}
features = ['cosine_tc', 'report_size', 'branch_size', 'files_touched_by_pr', 'developer_normalised_lag']
stopwords = utils_.GitMineUtils.STOPWORDS \
+ list(set(java_reserved + c_reserved + cpp_reserved + javascript_reserved + python_reserved))
for project in projects:
@@ -663,18 +686,18 @@

batches = generate_batches(repo, n_batches)
for i in [n_batches - 1]:
linker = Linker(net_size_in_days=14, min_tok_len=3, undersample_multiplicity=1, stopwords=stopwords,
linker = Linker(net_size_in_days=14, min_tok_len=3, undersample_multiplicity=1000, stopwords=stopwords,
feature_config=config, predictions_between_updates=1000)
training = list()
for j in range(n_batches - 1):
training += batches[j]
linker.fit(inflate_events(training, repo.langs, repo.name), truth)
linker.fit(inflate_events(training, repo.langs, repo.name), truth, features=features)
# forest = linker.clf
# importances = forest.feature_importances_
# std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
# pd.DataFrame(data={'Feature': features_string, 'Importance': importances, 'STD': std}) \
# .to_csv(project_dir[:-5] + ('_results_f%d_NullExplicit_UNKExplicit_FullFeatures_IMP.csv' % i))
scores = linker.validate_over_suffix(batches[i])
scores, unk_rate = linker.validate_over_suffix(batches[i])
scores_dict = dict()
for pr_id, predictions in scores:
try:
@@ -685,5 +708,8 @@
scores_dict[pr_id] = list(scores_dict[pr_id])
scores_dict[pr_id] = sorted(scores_dict[pr_id], reverse=True, key=lambda p: (p[1], p[0]))

with open(project_dir[:-5] + ('_results_f%d_NullExplicit_UNKExplicit_FullFeatures.txt' % i), 'w') as f:
with open(project_dir[:-5] + ('_results_f%d_selected_features_MF.txt' % i), 'w') as f:
f.write(str(scores_dict))

with open(project_dir[:-5] + ('_unk_rate_f%d_selected_features_MF.txt' % i), 'w') as f:
f.write(str(unk_rate))
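The gating logic added to update_from_flat_repo_and_predict above can be summarised in a small helper. This is a sketch, not code from the commit (is_salient_event is a hypothetical name), but it mirrors the two conditions used for issues and PRs:

from gitMine.VCClasses import IssueStates, StateChange

def is_salient_event(event, known_artifacts, closing_state):
    # event is a (trigger, artifact) pair; predict only on first sighting,
    # or when the trigger is a state change into `closing_state`
    # (IssueStates.closed for issues, IssueStates.merged for PRs).
    first_sighting = all(a.id_ != event[1].id_ for a in known_artifacts)
    closes = isinstance(event[0], StateChange) and event[0].to_ == closing_state
    return first_sighting or closes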
19 changes: 14 additions & 5 deletions Prediction/feature_generation.py
@@ -60,7 +60,7 @@ class FeatureGenerator(object):
Class that provides easy access to feature generation under a particular configuration
"""
def __init__(self, use_sim_cs, use_sim_j, use_social, use_temporal, use_file, use_pr_only, use_issue_only,
use_sim_d, similarity_config=None, temporal_config=None, text_cache=None):
use_sim_d, similarity_config=None, temporal_config=None, text_cache=None, selected=None):
self.use_sim_cs = use_sim_cs
self.use_sim_j = use_sim_j
self.use_sim_d = use_sim_d
@@ -80,6 +80,7 @@ def __init__(self, use_sim_cs, use_sim_j, use_social, use_temporal, use_file, use_pr_only, use_issue_only,
self.use_pr_only = use_pr_only
self.use_issue_only = use_issue_only
self.text_cache = dict() if text_cache is None else text_cache
self.selected = selected if selected is not None else list()

def via_text_cache(self, key, obj, full=True):
text_preprocessor = text_pipeline if full else preprocess_text
@@ -90,6 +91,11 @@ def via_text_cache(self, key, obj, full=True):
text = self.text_cache[key]
return text

def get_tf(self, text):
tf = self.dictionary.doc2bow(text, return_missing=True)
tf = tf[0] + [(-1, sum(tf[1].values()))]
return tf

def generate_features(self, issue_: Issue, pr: PullRequest, linked: bool) -> Dict[str, Any]:
issue_id = issue_.id_
pr_id = pr.number
@@ -116,19 +122,20 @@ def generate_features(self, issue_: Issue, pr: PullRequest, linked: bool) -> Dict[str, Any]:
cosine = cosine_similarity(pr_vector, i_vector)

pr_title_vector = np.zeros((len(self.dictionary.token2id) + 1,))
for index, value in self.model[self.dictionary.doc2bow(pr_title_text)]:

for index, value in self.model[self.get_tf(full_pr_text)]:
pr_title_vector[index] += value
pr_comment_vector = np.zeros((len(self.dictionary.token2id) + 1,))
try:
for index, value in self.model[self.dictionary.doc2bow(pr_desc_text)]:
for index, value in self.model[self.get_tf(pr_desc_text)]:
pr_comment_vector[index] += value
except IndexError:
pass
i_title_vector = np.zeros((len(self.dictionary.token2id) + 1,))
for index, value in self.model[self.dictionary.doc2bow(issue_title_text)]:
for index, value in self.model[self.get_tf(issue_title_text)]:
i_title_vector[index] += value
i_comment_vector = np.zeros((len(self.dictionary.token2id) + 1,))
for index, value in self.model[self.dictionary.doc2bow(issue_report_text)]:
for index, value in self.model[self.get_tf(issue_report_text)]:
i_comment_vector[index] += value

cosine_tt = cosine_similarity(pr_title_vector, i_title_vector)
@@ -254,4 +261,6 @@ def generate_features(self, issue_: Issue, pr: PullRequest, linked: bool) -> Dict[str, Any]:
features['bounces'] = len([s.to_ == IssueStates.open for s in issue_.states])
features['existing_links'] = len(issue_.commits)

if self.selected:
features = {k: v for k, v in features.items() if k in ['pr', 'issue', 'linked'] + self.selected}
return features
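The new get_tf builds on gensim's Dictionary.doc2bow, which with return_missing=True returns the bag-of-words plus a dict of out-of-vocabulary tokens and their counts. A minimal standalone sketch (toy corpus, illustrative ids):

from gensim.corpora import Dictionary

dictionary = Dictionary([['null', 'pointer', 'exception'], ['merge', 'branch']])

def get_tf(text):
    # Fold all out-of-vocabulary mass into a single reserved UNK id of -1.
    bow, missing = dictionary.doc2bow(text, return_missing=True)
    return bow + [(-1, sum(missing.values()))]

print(get_tf(['null', 'pointer', 'dereference']))  # e.g. [(1, 1), (2, 1), (-1, 1)]

Indexing with -1 wraps to the last position of a numpy array, which is presumably why the feature vectors above are allocated with len(self.dictionary.token2id) + 1 slots: the extra cell holds the UNK weight.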
4 changes: 2 additions & 2 deletions Prediction/training_utils.py
@@ -286,10 +286,10 @@ def train_classifier(training_data_: List[Dict[str, Any]], perform_feature_selec
clf_ = Pipeline([
('feature_selection', SelectFromModel(RFE(
RandomForestClassifier(n_estimators=128, class_weight='balanced_subsample'), 5, step=1))),
('classification', MondrianForestClassifier(n_estimators=128, bootstrap=True))
('classification', MondrianForestClassifier(n_estimators=50,))
])
else:
clf_ = MondrianForestClassifier(n_estimators=128, bootstrap=True)
clf_ = MondrianForestClassifier(n_estimators=50,)
clf_.partial_fit(X, y)
return clf_

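For context, MondrianForestClassifier comes from scikit-garden and supports genuine incremental learning, which is what lets update_truth in Linker.py call partial_fit on individual new links. A toy sketch with synthetic data (not from the project):

import numpy as np
from skgarden import MondrianForestClassifier

rng = np.random.RandomState(0)
X1, y1 = rng.rand(64, 5), rng.randint(0, 2, 64)
X2, y2 = rng.rand(64, 5), rng.randint(0, 2, 64)

clf = MondrianForestClassifier(n_estimators=50)
clf.partial_fit(X1, y1)  # first batch grows the trees
clf.partial_fit(X2, y2)  # later batches extend the same trees online
print(clf.predict_proba(X2[:3]))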
6 changes: 3 additions & 3 deletions Util/interpret_validation_results.py
@@ -68,7 +68,7 @@ def evaluate_at_threshold(result, th, top_k, truth):

if __name__ == '__main__':
location_format = '../data/dev_set/%s.json'
n_fold = 1
n_fold = 5
projects = [
'PhilJay_MPAndroidChart',
# 'ReactiveX_RxJava',
@@ -79,8 +79,8 @@ def evaluate_at_threshold(result, th, top_k, truth):
]
for project in projects:
results = list()
for fold in range(n_fold):
with open((location_format[:-5] + '_results_f%d_NullExplicit_UNKExplicit_FullFeatures.txt') % (project, fold)) as f:
for fold in [n_fold - 1]:
with open((location_format[:-5] + '_results_f%d_selected_features_MF.txt') % (project, fold)) as f:
result_str = f.read()
result = ast.literal_eval(result_str)
results.append((fold, result))
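The results file is a str()-dumped dict, so it can be read back with ast.literal_eval as above; a minimal sketch (the path is illustrative, following the naming scheme in this diff):

import ast

with open('../data/dev_set/PhilJay_MPAndroidChart_results_f4_selected_features_MF.txt') as f:
    scores_dict = ast.literal_eval(f.read())
# maps an id to its candidate links sorted by (probability, id), e.g.
# {'1234': [('5678', 0.83), ('91011', 0.12)]}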
