
Commit

Add the description of the main classes and the main functions. This makes the code readable.
salan668 committed Jun 18, 2018
1 parent c2e86a9 commit 5f2cb8d
Showing 12 changed files with 150 additions and 60 deletions.
15 changes: 15 additions & 0 deletions FAP/DataContainer/DataContainer.py
@@ -1,3 +1,8 @@
'''.
Jun 17, 2018.
Yang SONG, [email protected]
'''

import numpy as np
import os
import pandas as pd
@@ -6,6 +11,10 @@


class DataContainer:
'''
DataContainer is the key class of the FAP project. It is the node that connects the different modules: almost all
processors accept a DataContainer and return a new DataContainer.
'''
def __init__(self, array=np.array([]), label=np.array([]), feature_name=[], case_name=[]):
self.__feature_name = feature_name
self.__case_name = case_name
@@ -121,6 +130,12 @@ def UsualAndL2Normalize(self, store_path='', axis=0):
df.to_csv(store_path)

def ArtefactNormalize(self, normalization_file):
'''
Normalize the data according to an existing file that stores the normalization information. This is usually
used when a learnt model is applied to the testing data set.
:param normalization_file: the stored file with the normalization information.
:return:
'''
df = pd.read_csv(normalization_file, header=0, index_col=0)
mean_value = df.loc['mean'].values
std_value = df.loc['std'].values
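A minimal usage sketch of the normalization workflow described above. The file paths and the name of the normalization file are assumptions; the method names are the ones visible in this commit.

from FAP.DataContainer.DataContainer import DataContainer

# Fit the normalization on the training container and store its mean/std rows in a CSV file.
train_container = DataContainer()
train_container.Load(r'..\Example\training_numeric_feature.csv')            # assumed path
train_container.UsualAndL2Normalize(store_path=r'..\Example\normalization.csv')

# Reuse the stored normalization when a learnt model processes the testing data set.
test_container = DataContainer()
test_container.Load(r'..\Example\testing_numeric_feature.csv')              # assumed path
test_container.ArtefactNormalize(r'..\Example\normalization.csv')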
30 changes: 21 additions & 9 deletions FAP/DataContainer/DataSeperate.py
@@ -1,12 +1,16 @@
'''.
Jun 17, 2018.
Yang SONG, [email protected]
'''

import numpy as np
from random import shuffle
import os
import pandas as pd


from FAP.DataContainer.DataContainer import DataContainer

def SeperateDataToTrainingAndTesting(data, percentage=0.2, label=np.array(()), training_index_list = [], store_folder=''):
def SeperateDataToTrainingAndTesting(data, testing_percentage=0.2, label=np.array(()), training_index_list = [], store_folder=''):
is_label = True
if label.size == 0:
label = np.zeros((data.shape[0]), )
@@ -19,8 +23,8 @@ def SeperateDataToTrainingAndTesting(data, percentage=0.2, label=np.array(()), t
index = np.where(label == group)[0]

shuffle(index)
testing_index = index[:round(len(index) * percentage)]
training_index = index[round(len(index) * percentage):]
testing_index = index[:round(len(index) * testing_percentage)]
training_index = index[round(len(index) * testing_percentage):]

training_index_list.extend(training_index)
testing_index_list.extend(testing_index)
@@ -55,7 +59,16 @@ def SeperateDataToTrainingAndTesting(data, percentage=0.2, label=np.array(()), t
'training_index': training_index_list,
'testing_index': testing_index_list}

def GenerateTrainingAndTestingData(csv_file_path, training_index=[], percentage=0.3, is_store_index=False):
def GenerateTrainingAndTestingData(csv_file_path, training_index=[], testing_percentage=0.3, is_store_index=False):
'''
Separate the data container into a training part and a testing part.
:param csv_file_path: The file path of the data container.
:param training_index: The indices of the training data set. This is usually used to compare different
combinations of the sequences. Default is [].
:param testing_percentage: The percentage of the data set that is separated out as the testing data set. Default is 0.3 (30%).
:param is_store_index: Whether to store the split indices. Default is False.
:return:
'''
data_container = DataContainer()
data, label, feature_name, case_name = data_container.LoadAndGetData(csv_file_path)
folder_path = os.path.split(csv_file_path)[0]
@@ -73,7 +86,7 @@ def GenerateTrainingAndTestingData(csv_file_path, training_index=[], percentage=
else:
store_folder = ''

output = SeperateDataToTrainingAndTesting(data, percentage, label, training_index_list=training_index, store_folder=store_folder)
output = SeperateDataToTrainingAndTesting(data, testing_percentage, label, training_index_list=training_index, store_folder=store_folder)

training_data_contrainer = DataContainer(output['training_data'], output['training_label'], feature_name,
[case_name[temp] for temp in output['training_index']])
@@ -84,9 +97,8 @@ def GenerateTrainingAndTestingData(csv_file_path, training_index=[], percentage=
testing_data_contrainer.Save(os.path.join(testing_folder, 'numeric_feature.csv'))


# Demo
if __name__ == '__main__':
GenerateTrainingAndTestingData(r'C:\MyCode\PythonScript\EyeEnt\lymphoma_MM\T1C_T2\T1_T2', percentage=0.3)
GenerateTrainingAndTestingData(r'C:\MyCode\PythonScript\EyeEnt\lymphoma_MM\T1C_T2\T1_T2', testing_percentage=0.3)
training_index = pd.read_csv(r'C:\MyCode\PythonScript\EyeEnt\lymphoma_MM\T1C_T2\T1_T2\training\training_index.csv')
training_index = training_index.values[:, 1].tolist()
GenerateTrainingAndTestingData(r'C:\MyCode\PythonScript\EyeEnt\lymphoma_MM\T1C_T2\T1', training_index=training_index)
@@ -97,4 +109,4 @@ def GenerateTrainingAndTestingData(csv_file_path, training_index=[], percentage=
training_index = pd.read_csv(r'C:\MyCode\PythonScript\EyeEnt\lymphoma_MM\T1C_T2\T1\training\training_index.csv')
print(training_index.values[:, 1].tolist())
training_index = pd.read_csv(r'C:\MyCode\PythonScript\EyeEnt\lymphoma_MM\T1C_T2\T2\training\training_index.csv')
print(training_index.values[:, 1].tolist())
print(training_index.values[:, 1].tolist())
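SeperateDataToTrainingAndTesting splits the data stratified by label: within each label group the case indices are shuffled and the first testing_percentage of them become the testing set. A minimal standalone sketch of that idea, with illustrative helper names and data, independent of FAP:

import numpy as np

def stratified_split(label, testing_percentage=0.2, seed=0):
    # Illustrative helper, not part of FAP: split the indices per label group.
    rng = np.random.RandomState(seed)
    training_index, testing_index = [], []
    for group in np.unique(label):
        index = np.where(label == group)[0]
        rng.shuffle(index)
        n_testing = round(len(index) * testing_percentage)
        testing_index.extend(index[:n_testing].tolist())
        training_index.extend(index[n_testing:].tolist())
    return training_index, testing_index

label = np.array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1])
training_index, testing_index = stratified_split(label, testing_percentage=0.3)
print(len(training_index), len(testing_index))  # prints 7 3: 2 of the 7 negatives and 1 of the 3 positives go to testing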
3 changes: 3 additions & 0 deletions FAP/FeatureAnalysis/Classifier.py
@@ -9,6 +9,9 @@
from abc import ABCMeta,abstractmethod

class Classifier:
'''
This is the base class of the classifiers. All specific classifiers need to inherit from this base class.
'''
def __init__(self):
self.__model = None
self._x = np.array([])
9 changes: 9 additions & 0 deletions FAP/FeatureAnalysis/CrossValidation.py
@@ -14,6 +14,11 @@
from FAP.Func.Visualization import LoadWaitBar

class CrossValidation:
'''
CrossValidation is the base class for exploring the hyper-parameters. It currently supports leave-one-out (LOO),
10-fold, and 5-fold cross validation. A classifier must be set before running CV. A training metric and a
validation metric will be returned. If a testing data container is also set, the testing metric will be returned as well.
'''
def __init__(self, cv_method):
self.__classifier = Classifier()

@@ -127,6 +132,10 @@ def Run(self, data_container, test_data_container=DataContainer(), store_folder=
return train_metric, val_metric, test_metric

class CrossValidationOnFeatureNumber(CrossValidation):
'''
This helps explore the effect of the number of features on the model performance.
TODO: This exploration should be applied in the feature selector class.
'''
def __init__(self, cv_method, max_feature_number=1):
super(CrossValidationOnFeatureNumber, self).__init__(cv_method)
self.__max_feature_number = max_feature_number
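A usage sketch based on the calls visible in this commit. The data path is an assumption and the base Classifier() is only a placeholder; in practice a concrete subclass of Classifier should be set before running CV.

from FAP.DataContainer.DataContainer import DataContainer
from FAP.FeatureAnalysis.Classifier import Classifier
from FAP.FeatureAnalysis.CrossValidation import CrossValidation

data_container = DataContainer()
data_container.Load(r'..\..\Example\numeric_feature.csv')    # assumed path
test_container = DataContainer()                              # optional testing data container

cv = CrossValidation('5-folder')
cv.SetClassifier(Classifier())                                # placeholder: use a concrete classifier
train_metric, val_metric, test_metric = cv.Run(data_container, test_data_container=test_container)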
30 changes: 22 additions & 8 deletions FAP/FeatureAnalysis/FeaturePipeline.py
@@ -1,3 +1,8 @@
'''.
Jun 17, 2018.
Yang SONG, [email protected]
'''

from FAP.DataContainer.DataContainer import DataContainer
from FAP.FeatureAnalysis.CrossValidation import CrossValidation, CrossValidationOnFeatureNumber
from FAP.FeatureAnalysis.FeatureSelector import *
@@ -6,23 +11,37 @@
import pandas as pd

class FeatureAnalysisExplore:
'''
This is the entry point of the FAP project. It accepts the list of candidate feature selectors and the list of
candidate classifiers. The resulting metrics are then stored for each combination of feature selector and classifier.
'''
def __init__(self, feature_selector_list=[], classifier_list=[],
cv=CrossValidationOnFeatureNumber('5-folder'), max_feature_number=1):
self.__feature_selector_list = feature_selector_list
self.__classifier_list = classifier_list
self.__cv = cv
self.__max_feature_number = max_feature_number

def RunOneModel(self, data_container, feature_selector, classifier, cv, store_folder=''):
def RunOneModel(self, data_container, feature_selector, classifier, cv, test_data_container=DataContainer(), store_folder=''):
'''
:param data_container: The instance of the DataContainer.
:param feature_selector: The instance of the FeatureSelector.
:param classifier: The instance of the Classifier.
:param cv: The instance of the CrossValidation.
:param test_data_container: The instance of the DataContainer used for testing. Default is an empty DataContainer.
:param store_folder: The path of the store folder.
:return: The metrics of the validation data and the testing data.
'''
feature_selector.SetDataContainer(data_container)
selected_data_container = feature_selector.Run(store_folder)

cv.SetClassifier(classifier)
cv.SetDataContainer(selected_data_container)

train_metric, val_metric = cv.Run()
train_metric, val_metric, test_metric = cv.Run(data_container, test_data_container=test_data_container,
store_folder=store_folder)

return val_metric
return val_metric, test_metric

def Run(self, data_container, test_data_container=DataContainer(), store_folder=''):

@@ -75,11 +94,6 @@ def Run(self, data_container, test_data_container=DataContainer(), store_folder=
data_container.Load(r'..\tempResult\NumericFeature.csv')
data_container.UsualNormalize()

column_list = ['sample_number', 'positive_number', 'negative_number',
'auc', 'auc 95% CIs', 'accuracy', 'feature_number',
'Yorden Index', 'sensitivity', 'specificity',
'positive predictive value', 'negative predictive value']

df = pd.DataFrame(columns=column_list)

# Set Feature Selector List
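A sketch of running a single selector/classifier combination with RunOneModel, assembled from the pieces that appear in this commit. The data path is an assumption and the base Classifier() is only a placeholder for a concrete subclass.

from FAP.DataContainer.DataContainer import DataContainer
from FAP.FeatureAnalysis.Classifier import Classifier
from FAP.FeatureAnalysis.CrossValidation import CrossValidationOnFeatureNumber
from FAP.FeatureAnalysis.FeatureSelector import FeatureSelectByKeyName
from FAP.FeatureAnalysis.FeaturePipeline import FeatureAnalysisExplore

data_container = DataContainer()
data_container.Load(r'..\..\Example\numeric_feature.csv')                          # assumed path

selector = FeatureSelectByKeyName([[], [], ['shape', 'firstorder', 'glrlm'], []], method='or')
cv = CrossValidationOnFeatureNumber('5-folder')
classifier = Classifier()                                                           # placeholder: use a concrete subclass

explore = FeatureAnalysisExplore()
val_metric, test_metric = explore.RunOneModel(data_container, selector, classifier, cv)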
10 changes: 7 additions & 3 deletions FAP/FeatureAnalysis/FeatureSelector.py
@@ -525,17 +525,21 @@ def Run(self, data_container, store_folder=''):
################################################################

if __name__ == '__main__':
import os
print(os.getcwd())
from FAP.DataContainer.DataContainer import DataContainer
data_container = DataContainer()
data_container.Load(r'..\Result\NumericFeature.csv')
print(os.path.abspath(r'..\..\Example\numeric_feature.csv'))
data_container.Load(r'..\..\Example\numeric_feature.csv')
# data_container.UsualNormalize()

print(data_container.GetArray().shape)
print(data_container.GetFeatureName())

fs = FeatureSelectByKeyName([['T1C', 'T2'], ['invoved', 'age'], [], []])
fs = FeatureSelectByKeyName([[], [], ['shape', 'firstorder', 'glrlm'], []], method='or')

output = fs.Run(data_container)
print(output.GetFrame().head(5))
print(output.GetFeatureName())

# fs1 = RemoveNonNumericFeature()
# fs1.SetDataContainer(data_container)
23 changes: 19 additions & 4 deletions FAP/Func/Metric.py
@@ -3,7 +3,17 @@
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix

def AUC_Confidence_Interval(y_true, y_pred, CI_index=0.95):
# AUC = roc_auc_score(y_true, y_pred)
'''
This function calculates the AUC value and its confidence interval. Note that the confidence interval is not
calculated from the standard deviation. The AUC is calculated by sklearn, and the AUC of the group is bootstrapped
1000 times; the confidence interval is extracted from the bootstrap result.
:param y_true: The label, dim should be 1.
:param y_pred: The prediction, dim should be 1.
:param CI_index: The range of the confidence interval. Default is 0.95.
:return: The AUC value, a list with the lower and upper bounds of the confidence interval, and the sorted bootstrap scores.
'''

AUC = roc_auc_score(y_true, y_pred)

n_bootstraps = 1000
rng_seed = 42 # control reproducibility
@@ -32,12 +42,18 @@ def AUC_Confidence_Interval(y_true, y_pred, CI_index=0.95):
confidence_upper = sorted_scores[int(1.0 - (1.0 - CI_index) / 2 * len(sorted_scores))]
CI = [confidence_lower, confidence_upper]

AUC = sorted_scores[len(sorted_scores) // 2]

# print('AUC is {:.3f}, Confidence interval : [{:0.3f} - {:0.3}]'.format(AUC, confidence_lower, confidence_upper))
return AUC, CI, sorted_scores

def EstimateMetirc(prediction, label, key_word=''):
'''
Calculate the medical metrics according to the prediction and the label.
:param prediction: The prediction. Dim is 1.
:param label: The label. Dim is 1.
:param key_word: The word to add in front of each metric key. Usually used to distinguish the training,
validation, and testing data sets.
:return: A dictionary of the calculated metrics.
'''
if key_word != '':
key_word += '_'

@@ -46,7 +62,6 @@ def EstimateMetirc(prediction, label, key_word=''):
metric[key_word + 'positive_number'] = np.sum(label)
metric[key_word + 'negative_number'] = len(label) - np.sum(label)


fpr, tpr, threshold = roc_curve(label, prediction)
index = np.argmax(1 - fpr + tpr)
metric[key_word + 'Yorden Index'] = '{:.4f}'.format(threshold[index])
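A self-contained sketch of the bootstrap idea described in the AUC_Confidence_Interval docstring. The data is illustrative; this mirrors the technique, not the FAP implementation itself.

import numpy as np
from sklearn.metrics import roc_auc_score

y_true = np.array([0, 0, 1, 1, 0, 1, 1, 0, 1, 0])
y_pred = np.array([0.1, 0.4, 0.8, 0.7, 0.3, 0.9, 0.6, 0.2, 0.5, 0.35])

rng = np.random.RandomState(42)                        # control reproducibility
scores = []
for _ in range(1000):                                  # 1000 bootstrap resamples
    index = rng.randint(0, len(y_pred), len(y_pred))   # sample cases with replacement
    if len(np.unique(y_true[index])) < 2:
        continue                                       # AUC needs both classes present
    scores.append(roc_auc_score(y_true[index], y_pred[index]))

sorted_scores = np.array(sorted(scores))
ci = [sorted_scores[int(0.025 * len(sorted_scores))],
      sorted_scores[int(0.975 * len(sorted_scores))]]  # 95% confidence interval
print(roc_auc_score(y_true, y_pred), ci)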
8 changes: 4 additions & 4 deletions FAP/Func/Visualization.py
@@ -8,10 +8,10 @@ def DrawBoundaryOfBinaryMask(image, ROI):

def LoadWaitBar(total, progress):
'''
runs = 300
for run_num in range(runs):
time.sleep(.1)
updt(runs, run_num + 1)
Show the wait bar.
:param total: the total number of steps
:param progress: the index of the current step
:return:
'''
barLength, status = 20, ""
raw_progress = progress
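A short usage sketch of the wait bar, following the example that was previously embedded in the docstring; the loop body is illustrative.

import time
from FAP.Func.Visualization import LoadWaitBar

total = 300
for step in range(total):
    time.sleep(0.1)               # one unit of work
    LoadWaitBar(total, step + 1)  # update the wait bar after each step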
11 changes: 11 additions & 0 deletions FAP/Visualization/DrawDoubleLines.py
@@ -3,6 +3,17 @@
color_list = sns.color_palette('deep') + sns.color_palette('bright')

def DrawDoubleYLines(x, y1, y2, xlabel='', ylabel=['', ''], legend=['', ''], store_path=''):
'''
Draw the lines with double y-axes.
:param x: The vector of the x axis.
:param y1: The vector of the first y axis.
:param y2: The vector of the second y axis.
:param xlabel: The label of the x axis. Default is ''.
:param ylabel: The list of the y labels. Default is ['', ''].
:param legend: The list of the legends. Default is ['', ''].
:param store_path: The store path of the figure. Supports the 'jpg' and 'eps' formats.
:return:
'''
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.plot(x, y1, color=color_list[0])
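A usage sketch of DrawDoubleYLines; the data and axis labels are illustrative, e.g. plotting training and validation AUC against the number of selected features.

import numpy as np
from FAP.Visualization.DrawDoubleLines import DrawDoubleYLines

feature_number = np.arange(1, 11)
train_auc = np.linspace(0.70, 0.95, 10)   # illustrative values
val_auc = np.linspace(0.65, 0.85, 10)     # illustrative values
DrawDoubleYLines(feature_number, train_auc, val_auc,
                 xlabel='feature number',
                 ylabel=['training AUC', 'validation AUC'],
                 legend=['training', 'validation'],
                 store_path='double_y.jpg')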
3 changes: 1 addition & 2 deletions FAP/Visualization/DrawROCList.py
@@ -9,11 +9,10 @@
def DrawROCList(pred_list, label_list, name_list='', store_path=''):
'''
Draw the ROC curves.
:param pred_list: The list of predictions.
:param label_list: The list of labels.
:param name_list: The list of legend names.
:param store_path: The store path. Support jpeg and tif.
:param store_path: The store path. Support jpg and eps.
:return: None
Apr-28-18, Yang SONG [[email protected]]
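A usage sketch of DrawROCList with illustrative predictions and labels.

import numpy as np
from FAP.Visualization.DrawROCList import DrawROCList

train_label = np.array([0, 0, 1, 1, 0, 1])
train_pred = np.array([0.2, 0.4, 0.8, 0.7, 0.3, 0.9])
test_label = np.array([0, 1, 1, 0])
test_pred = np.array([0.35, 0.6, 0.75, 0.45])
DrawROCList([train_pred, test_pred], [train_label, test_label],
            name_list=['training', 'testing'], store_path='roc.jpg')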