
Commit

Add the description of the main classes and the main functions. This makes the code readable.
salan668 committed Jun 18, 2018
1 parent c2e86a9 commit 5f2cb8d
Showing 12 changed files with 150 additions and 60 deletions.
15 changes: 15 additions & 0 deletions FAP/DataContainer/DataContainer.py
@@ -1,3 +1,8 @@
'''.
Jun 17, 2018.
Yang SONG, [email protected]
'''

import numpy as np
import os
import pandas as pd
@@ -6,6 +11,10 @@


class DataContainer:
'''
DataContainer is the key class of the FAP project. It is the node that connects the different modules: almost all
processors accept a DataContainer and return a new DataContainer.
'''
def __init__(self, array=np.array([]), label=np.array([]), feature_name=[], case_name=[]):
self.__feature_name = feature_name
self.__case_name = case_name
@@ -121,6 +130,12 @@ def UsualAndL2Normalize(self, store_path='', axis=0):
df.to_csv(store_path)

def ArtefactNormalize(self, normalization_file):
'''
Normalize the data according to an existing file that stores the normalization information. This is usually
used when a learnt model is applied to the testing data set.
:param normalization_file: the stored file with the normalization information.
:return:
'''
df = pd.read_csv(normalization_file, header=0, index_col=0)
mean_value = df.loc['mean'].values
std_value = df.loc['std'].values
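A minimal usage sketch of the normalization workflow described above. The file paths and the name of the normalization file are assumptions; the method names are the ones visible in this commit.

from FAP.DataContainer.DataContainer import DataContainer

# Fit the normalization on the training container and store its mean/std rows in a CSV file.
train_container = DataContainer()
train_container.Load(r'..\Example\training_numeric_feature.csv')            # assumed path
train_container.UsualAndL2Normalize(store_path=r'..\Example\normalization.csv')

# Reuse the stored normalization when a learnt model processes the testing data set.
test_container = DataContainer()
test_container.Load(r'..\Example\testing_numeric_feature.csv')              # assumed path
test_container.ArtefactNormalize(r'..\Example\normalization.csv')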
30 changes: 21 additions & 9 deletions FAP/DataContainer/DataSeperate.py
@@ -1,12 +1,16 @@
'''.
Jun 17, 2018.
Yang SONG, [email protected]
'''

import numpy as np
from random import shuffle
import os
import pandas as pd


from FAP.DataContainer.DataContainer import DataContainer

def SeperateDataToTrainingAndTesting(data, percentage=0.2, label=np.array(()), training_index_list = [], store_folder=''):
def SeperateDataToTrainingAndTesting(data, testing_percentage=0.2, label=np.array(()), training_index_list = [], store_folder=''):
is_label = True
if label.size == 0:
label = np.zeros((data.shape[0]), )
@@ -19,8 +23,8 @@ def SeperateDataToTrainingAndTesting(data, percentage=0.2, label=np.array(()), t
index = np.where(label == group)[0]

shuffle(index)
testing_index = index[:round(len(index) * percentage)]
training_index = index[round(len(index) * percentage):]
testing_index = index[:round(len(index) * testing_percentage)]
training_index = index[round(len(index) * testing_percentage):]

training_index_list.extend(training_index)
testing_index_list.extend(testing_index)
@@ -55,7 +59,16 @@ def SeperateDataToTrainingAndTesting(data, percentage=0.2, label=np.array(()), t
'training_index': training_index_list,
'testing_index': testing_index_list}

def GenerateTrainingAndTestingData(csv_file_path, training_index=[], percentage=0.3, is_store_index=False):
def GenerateTrainingAndTestingData(csv_file_path, training_index=[], testing_percentage=0.3, is_store_index=False):
'''
Separate the data container into a training part and a testing part.
:param csv_file_path: The file path of the data container.
:param training_index: The indices of the training data set. This is usually used to compare different
combinations of the sequences. Default is [].
:param testing_percentage: The percentage of the data set that is separated out as the testing data set. Default is 0.3 (30%).
:param is_store_index: Whether to store the split indices. Default is False.
:return:
'''
data_container = DataContainer()
data, label, feature_name, case_name = data_container.LoadAndGetData(csv_file_path)
folder_path = os.path.split(csv_file_path)[0]
@@ -73,7 +86,7 @@ def GenerateTrainingAndTestingData(csv_file_path, training_index=[], percentage=
else:
store_folder = ''

output = SeperateDataToTrainingAndTesting(data, percentage, label, training_index_list=training_index, store_folder=store_folder)
output = SeperateDataToTrainingAndTesting(data, testing_percentage, label, training_index_list=training_index, store_folder=store_folder)

training_data_contrainer = DataContainer(output['training_data'], output['training_label'], feature_name,
[case_name[temp] for temp in output['training_index']])
@@ -84,9 +97,8 @@ def GenerateTrainingAndTestingData(csv_file_path, training_index=[], percentage=
testing_data_contrainer.Save(os.path.join(testing_folder, 'numeric_feature.csv'))


# Demo
if __name__ == '__main__':
GenerateTrainingAndTestingData(r'C:\MyCode\PythonScript\EyeEnt\lymphoma_MM\T1C_T2\T1_T2', percentage=0.3)
GenerateTrainingAndTestingData(r'C:\MyCode\PythonScript\EyeEnt\lymphoma_MM\T1C_T2\T1_T2', testing_percentage=0.3)
training_index = pd.read_csv(r'C:\MyCode\PythonScript\EyeEnt\lymphoma_MM\T1C_T2\T1_T2\training\training_index.csv')
training_index = training_index.values[:, 1].tolist()
GenerateTrainingAndTestingData(r'C:\MyCode\PythonScript\EyeEnt\lymphoma_MM\T1C_T2\T1', training_index=training_index)
@@ -97,4 +109,4 @@ def GenerateTrainingAndTestingData(csv_file_path, training_index=[], percentage=
training_index = pd.read_csv(r'C:\MyCode\PythonScript\EyeEnt\lymphoma_MM\T1C_T2\T1\training\training_index.csv')
print(training_index.values[:, 1].tolist())
training_index = pd.read_csv(r'C:\MyCode\PythonScript\EyeEnt\lymphoma_MM\T1C_T2\T2\training\training_index.csv')
print(training_index.values[:, 1].tolist())
print(training_index.values[:, 1].tolist())
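SeperateDataToTrainingAndTesting splits the data stratified by label: within each label group the case indices are shuffled and the first testing_percentage of them become the testing set. A minimal standalone sketch of that idea, with illustrative helper names and data, independent of FAP:

import numpy as np

def stratified_split(label, testing_percentage=0.2, seed=0):
    # Illustrative helper, not part of FAP: split the indices per label group.
    rng = np.random.RandomState(seed)
    training_index, testing_index = [], []
    for group in np.unique(label):
        index = np.where(label == group)[0]
        rng.shuffle(index)
        n_testing = round(len(index) * testing_percentage)
        testing_index.extend(index[:n_testing].tolist())
        training_index.extend(index[n_testing:].tolist())
    return training_index, testing_index

label = np.array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1])
training_index, testing_index = stratified_split(label, testing_percentage=0.3)
print(len(training_index), len(testing_index))  # prints 7 3: 2 of the 7 negatives and 1 of the 3 positives go to testing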
3 changes: 3 additions & 0 deletions FAP/FeatureAnalysis/Classifier.py
@@ -9,6 +9,9 @@
from abc import ABCMeta,abstractmethod

class Classifier:
'''
This is the base class of the classifiers. All specific classifiers need to inherit from this base class.
'''
def __init__(self):
self.__model = None
self._x = np.array([])
9 changes: 9 additions & 0 deletions FAP/FeatureAnalysis/CrossValidation.py
@@ -14,6 +14,11 @@
from FAP.Func.Visualization import LoadWaitBar

class CrossValidation:
'''
CrossValidation is the base class for exploring the hyper-parameters. It currently supports leave-one-out (LOO),
10-fold, and 5-fold cross validation. A classifier must be set before running CV. A training metric and a
validation metric will be returned. If a testing data container is also set, the testing metric will be returned as well.
'''
def __init__(self, cv_method):
self.__classifier = Classifier()

@@ -127,6 +132,10 @@ def Run(self, data_container, test_data_container=DataContainer(), store_folder=
return train_metric, val_metric, test_metric

class CrossValidationOnFeatureNumber(CrossValidation):
'''
This helps explore the effect of the number of features on the model performance.
TODO: This exploration should be applied in the feature selector class.
'''
def __init__(self, cv_method, max_feature_number=1):
super(CrossValidationOnFeatureNumber, self).__init__(cv_method)
self.__max_feature_number = max_feature_number
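A usage sketch based on the calls visible in this commit. The data path is an assumption and the base Classifier() is only a placeholder; in practice a concrete subclass of Classifier should be set before running CV.

from FAP.DataContainer.DataContainer import DataContainer
from FAP.FeatureAnalysis.Classifier import Classifier
from FAP.FeatureAnalysis.CrossValidation import CrossValidation

data_container = DataContainer()
data_container.Load(r'..\..\Example\numeric_feature.csv')    # assumed path
test_container = DataContainer()                              # optional testing data container

cv = CrossValidation('5-folder')
cv.SetClassifier(Classifier())                                # placeholder: use a concrete classifier
train_metric, val_metric, test_metric = cv.Run(data_container, test_data_container=test_container)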
30 changes: 22 additions & 8 deletions FAP/FeatureAnalysis/FeaturePipeline.py
@@ -1,3 +1,8 @@
'''.
Jun 17, 2018.
Yang SONG, [email protected]
'''

from FAP.DataContainer.DataContainer import DataContainer
from FAP.FeatureAnalysis.CrossValidation import CrossValidation, CrossValidationOnFeatureNumber
from FAP.FeatureAnalysis.FeatureSelector import *
@@ -6,23 +11,37 @@
import pandas as pd

class FeatureAnalysisExplore:
'''
This is the entry point of the FAP project. It accepts the list of candidate feature selectors and the list of
candidate classifiers. The resulting metrics are then stored for each combination of feature selector and classifier.
'''
def __init__(self, feature_selector_list=[], classifier_list=[],
cv=CrossValidationOnFeatureNumber('5-folder'), max_feature_number=1):
self.__feature_selector_list = feature_selector_list
self.__classifier_list = classifier_list
self.__cv = cv
self.__max_feature_number = max_feature_number

def RunOneModel(self, data_container, feature_selector, classifier, cv, store_folder=''):
def RunOneModel(self, data_container, feature_selector, classifier, cv, test_data_container=DataContainer(), store_folder=''):
'''
:param data_container: The instance of the DataContainer.
:param feature_selector: The instance of the FeatureSelector.
:param classifier: The instance of the Classifier.
:param cv: The instance of the CrossValidation.
:param test_data_container: The instance of the DataContainer used for testing. Default is an empty DataContainer.
:param store_folder: The path of the store folder.
:return: The metrics of the validation data and the testing data.
'''
feature_selector.SetDataContainer(data_container)
selected_data_container = feature_selector.Run(store_folder)

cv.SetClassifier(classifier)
cv.SetDataContainer(selected_data_container)

train_metric, val_metric = cv.Run()
train_metric, val_metric, test_metric = cv.Run(data_container, test_data_container=test_data_container,
store_folder=store_folder)

return val_metric
return val_metric, test_metric

def Run(self, data_container, test_data_container=DataContainer(), store_folder=''):

@@ -75,11 +94,6 @@ def Run(self, data_container, test_data_container=DataContainer(), store_folder=
data_container.Load(r'..\tempResult\NumericFeature.csv')
data_container.UsualNormalize()

column_list = ['sample_number', 'positive_number', 'negative_number',
'auc', 'auc 95% CIs', 'accuracy', 'feature_number',
'Yorden Index', 'sensitivity', 'specificity',
'positive predictive value', 'negative predictive value']

df = pd.DataFrame(columns=column_list)

# Set Feature Selector List
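A sketch of running a single selector/classifier combination with RunOneModel, assembled from the pieces that appear in this commit. The data path is an assumption and the base Classifier() is only a placeholder for a concrete subclass.

from FAP.DataContainer.DataContainer import DataContainer
from FAP.FeatureAnalysis.Classifier import Classifier
from FAP.FeatureAnalysis.CrossValidation import CrossValidationOnFeatureNumber
from FAP.FeatureAnalysis.FeatureSelector import FeatureSelectByKeyName
from FAP.FeatureAnalysis.FeaturePipeline import FeatureAnalysisExplore

data_container = DataContainer()
data_container.Load(r'..\..\Example\numeric_feature.csv')                          # assumed path

selector = FeatureSelectByKeyName([[], [], ['shape', 'firstorder', 'glrlm'], []], method='or')
cv = CrossValidationOnFeatureNumber('5-folder')
classifier = Classifier()                                                           # placeholder: use a concrete subclass

explore = FeatureAnalysisExplore()
val_metric, test_metric = explore.RunOneModel(data_container, selector, classifier, cv)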
10 changes: 7 additions & 3 deletions FAP/FeatureAnalysis/FeatureSelector.py
@@ -525,17 +525,21 @@ def Run(self, data_container, store_folder=''):
################################################################

if __name__ == '__main__':
import os
print(os.getcwd())
from FAP.DataContainer.DataContainer import DataContainer
data_container = DataContainer()
data_container.Load(r'..\Result\NumericFeature.csv')
print(os.path.abspath(r'..\..\Example\numeric_feature.csv'))
data_container.Load(r'..\..\Example\numeric_feature.csv')
# data_container.UsualNormalize()

print(data_container.GetArray().shape)
print(data_container.GetFeatureName())

fs = FeatureSelectByKeyName([['T1C', 'T2'], ['invoved', 'age'], [], []])
fs = FeatureSelectByKeyName([[], [], ['shape', 'firstorder', 'glrlm'], []], method='or')

output = fs.Run(data_container)
print(output.GetFrame().head(5))
print(output.GetFeatureName())

# fs1 = RemoveNonNumericFeature()
# fs1.SetDataContainer(data_container)
23 changes: 19 additions & 4 deletions FAP/Func/Metric.py
@@ -3,7 +3,17 @@
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix

def AUC_Confidence_Interval(y_true, y_pred, CI_index=0.95):
# AUC = roc_auc_score(y_true, y_pred)
'''
This function calculates the AUC value and its confidence interval. Note that the confidence interval is not
calculated from the standard deviation. The AUC is calculated by sklearn, and the AUC of the group is bootstrapped
1000 times; the confidence interval is extracted from the bootstrap result.
:param y_true: The label, dim should be 1.
:param y_pred: The prediction, dim should be 1.
:param CI_index: The range of the confidence interval. Default is 0.95.
:return: The AUC value, a list with the lower and upper bounds of the confidence interval, and the sorted bootstrap scores.
'''

AUC = roc_auc_score(y_true, y_pred)

n_bootstraps = 1000
rng_seed = 42 # control reproducibility
@@ -32,12 +42,18 @@ def AUC_Confidence_Interval(y_true, y_pred, CI_index=0.95):
confidence_upper = sorted_scores[int(1.0 - (1.0 - CI_index) / 2 * len(sorted_scores))]
CI = [confidence_lower, confidence_upper]

AUC = sorted_scores[len(sorted_scores) // 2]

# print('AUC is {:.3f}, Confidence interval : [{:0.3f} - {:0.3}]'.format(AUC, confidence_lower, confidence_upper))
return AUC, CI, sorted_scores

def EstimateMetirc(prediction, label, key_word=''):
'''
Calculate the medical metrics according to the prediction and the label.
:param prediction: The prediction. Dim is 1.
:param label: The label. Dim is 1.
:param key_word: The word to add in front of each metric key. Usually used to distinguish the training,
validation, and testing data sets.
:return: A dictionary of the calculated metrics.
'''
if key_word != '':
key_word += '_'

@@ -46,7 +62,6 @@ def EstimateMetirc(prediction, label, key_word=''):
metric[key_word + 'positive_number'] = np.sum(label)
metric[key_word + 'negative_number'] = len(label) - np.sum(label)


fpr, tpr, threshold = roc_curve(label, prediction)
index = np.argmax(1 - fpr + tpr)
metric[key_word + 'Yorden Index'] = '{:.4f}'.format(threshold[index])
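A self-contained sketch of the bootstrap idea described in the AUC_Confidence_Interval docstring. The data is illustrative; this mirrors the technique, not the FAP implementation itself.

import numpy as np
from sklearn.metrics import roc_auc_score

y_true = np.array([0, 0, 1, 1, 0, 1, 1, 0, 1, 0])
y_pred = np.array([0.1, 0.4, 0.8, 0.7, 0.3, 0.9, 0.6, 0.2, 0.5, 0.35])

rng = np.random.RandomState(42)                        # control reproducibility
scores = []
for _ in range(1000):                                  # 1000 bootstrap resamples
    index = rng.randint(0, len(y_pred), len(y_pred))   # sample cases with replacement
    if len(np.unique(y_true[index])) < 2:
        continue                                       # AUC needs both classes present
    scores.append(roc_auc_score(y_true[index], y_pred[index]))

sorted_scores = np.array(sorted(scores))
ci = [sorted_scores[int(0.025 * len(sorted_scores))],
      sorted_scores[int(0.975 * len(sorted_scores))]]  # 95% confidence interval
print(roc_auc_score(y_true, y_pred), ci)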
8 changes: 4 additions & 4 deletions FAP/Func/Visualization.py
@@ -8,10 +8,10 @@ def DrawBoundaryOfBinaryMask(image, ROI):

def LoadWaitBar(total, progress):
'''
runs = 300
for run_num in range(runs):
time.sleep(.1)
updt(runs, run_num + 1)
Show the wait bar.
:param total: the total number of steps
:param progress: the index of the current step
:return:
'''
barLength, status = 20, ""
raw_progress = progress
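A short usage sketch of the wait bar, following the example that was previously embedded in the docstring; the loop body is illustrative.

import time
from FAP.Func.Visualization import LoadWaitBar

total = 300
for step in range(total):
    time.sleep(0.1)               # one unit of work
    LoadWaitBar(total, step + 1)  # update the wait bar after each step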
11 changes: 11 additions & 0 deletions FAP/Visualization/DrawDoubleLines.py
@@ -3,6 +3,17 @@
color_list = sns.color_palette('deep') + sns.color_palette('bright')

def DrawDoubleYLines(x, y1, y2, xlabel='', ylabel=['', ''], legend=['', ''], store_path=''):
'''
Draw the lines with double y-axes.
:param x: The vector of the x axis.
:param y1: The vector of the first y axis.
:param y2: The vector of the second y axis.
:param xlabel: The label of the x axis. Default is ''.
:param ylabel: The list of the y labels. Default is ['', ''].
:param legend: The list of the legends. Default is ['', ''].
:param store_path: The store path of the figure. Supports the 'jpg' and 'eps' formats.
:return:
'''
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.plot(x, y1, color=color_list[0])
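A usage sketch of DrawDoubleYLines; the data and axis labels are illustrative, e.g. plotting training and validation AUC against the number of selected features.

import numpy as np
from FAP.Visualization.DrawDoubleLines import DrawDoubleYLines

feature_number = np.arange(1, 11)
train_auc = np.linspace(0.70, 0.95, 10)   # illustrative values
val_auc = np.linspace(0.65, 0.85, 10)     # illustrative values
DrawDoubleYLines(feature_number, train_auc, val_auc,
                 xlabel='feature number',
                 ylabel=['training AUC', 'validation AUC'],
                 legend=['training', 'validation'],
                 store_path='double_y.jpg')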
3 changes: 1 addition & 2 deletions FAP/Visualization/DrawROCList.py
@@ -9,11 +9,10 @@
def DrawROCList(pred_list, label_list, name_list='', store_path=''):
'''
Draw the ROC curves.
:param pred_list: The list of predictions.
:param label_list: The list of labels.
:param name_list: The list of legend names.
:param store_path: The store path. Support jpeg and tif.
:param store_path: The store path. Support jpg and eps.
:return: None
Apr-28-18, Yang SONG [[email protected]]
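A usage sketch of DrawROCList with illustrative predictions and labels.

import numpy as np
from FAP.Visualization.DrawROCList import DrawROCList

train_label = np.array([0, 0, 1, 1, 0, 1])
train_pred = np.array([0.2, 0.4, 0.8, 0.7, 0.3, 0.9])
test_label = np.array([0, 1, 1, 0])
test_pred = np.array([0.35, 0.6, 0.75, 0.45])
DrawROCList([train_pred, test_pred], [train_label, test_label],
            name_list=['training', 'testing'], store_path='roc.jpg')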