Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
bnsreenu authored Feb 4, 2021
1 parent f9002e3 commit 7663ef1
Showing 1 changed file with 134 additions and 0 deletions.
134 changes: 134 additions & 0 deletions 198_Boruta_feature_selection_breast_cancer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# https://youtu.be/_5t8ZtRybT8

"""
https://pypi.org/project/Boruta/
pip install Boruta
Dataset:
https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)
"""

import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns


# Load the Wisconsin breast cancer (diagnostic) dataset from the local CSV.
df = pd.read_csv("data/wisconsin_breast_cancer_dataset.csv")

# Summary statistics, one row per numeric column. The columns sit on very
# different scales, so the values need to be normalized before fitting.
print(df.describe().transpose())

# Count of missing values per column. If any show up, drop those rows with
# df.dropna() before continuing.
print(df.isna().sum())

# Rename the 'Diagnosis' column to 'Label' to make its role as the target obvious.
df = df.rename({'Diagnosis': 'Label'}, axis="columns")
print(df.dtypes)

#Understand the data
#sns.countplot(x="Label", data=df) #M - malignant B - benign


####### Replace categorical values with numbers########
# Show the class balance (M - malignant, B - benign). The result has to be
# printed explicitly: a bare expression is silently discarded when this file
# is run as a script rather than cell-by-cell in an interactive console.
print(df['Label'].value_counts())

# Dependent variable that needs to be predicted, as an array of raw labels.
y = df["Label"].values

# Encode the categorical labels as integers. LabelEncoder assigns codes in
# sorted class order, so B=0 and M=1.
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
Y = labelencoder.fit_transform(y)
#################################################################
#Define x and normalize values

# Define the independent variables: every column except the target ("Label")
# and the sample identifier ("ID"), which carries no predictive information.
X = df.drop(labels=["Label", "ID"], axis=1)

import numpy as np
# Keep the column names; X_train / X_test become plain arrays after scaling.
feature_names = np.array(X.columns)

# Split into train and test BEFORE scaling so that the accuracy measured
# after fitting the model is computed on data the pipeline has never seen.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

# Fit the scaler on the training rows only and reuse its mean / standard
# deviation for the test rows. Fitting it on the full dataset would leak
# test-set statistics into training.
# NOTE: X itself is left as the unscaled feature DataFrame.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

###########################################################################
# Define XGBOOST classifier to be used by Boruta
import xgboost as xgb
from boruta import BorutaPy

# How Boruta works:
#   1. Create shadow features - random features and shuffle values in columns.
#   2. Train Random Forest / XGBoost and calculate feature importance via
#      mean decrease impurity.
#   3. Check if real features have higher importance compared to shadow features.
#   4. Repeat this for every iteration.
#   5. If the original feature performed better, mark it as important.

# Estimator that Boruta uses to score feature importance.
model = xgb.XGBClassifier()

# Define the Boruta feature selection method around that estimator.
feat_selector = BorutaPy(
    estimator=model,
    n_estimators='auto',
    random_state=1,
    verbose=2,
)

# Find all relevant features, using the training data only.
feat_selector.fit(X_train, y_train)


# check selected features
# Per-feature decision: True where the feature should be kept.
print(feat_selector.support_)

# Per-feature ranking: rank 1 is the best.
print(feat_selector.ranking_)

# Apply the feature selection to the training data, keeping only the
# selected columns.
X_filtered = feat_selector.transform(X_train)

# Review the features: pair each feature name with its rank and decision.
feature_ranks = list(zip(feature_names,
                         feat_selector.ranking_,
                         feat_selector.support_))

# Print one aligned line per feature. The print call must be indented so it
# belongs to the loop body.
for name, rank, keep in feature_ranks:
    print('Feature: {:<30} Rank: {}, Keep: {}'.format(name, rank, keep))


############################################################
#Now use the subset of features to fit XGBoost model on training data
import xgboost as xgb

# Run the test data through the same feature selector so that it keeps
# exactly the columns that were selected on the training data.
X_test_filtered = feat_selector.transform(X_test)

# Fit a fresh XGBoost classifier on the reduced training set.
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_filtered, y_train)

# Predict on the (filtered) test data using the trained model.
prediction_xgb = xgb_model.predict(X_test_filtered)


#Print overall accuracy
from sklearn.metrics import accuracy_score, confusion_matrix

# Overall accuracy on the held-out test data.
print("Accuracy = ", accuracy_score(y_test, prediction_xgb))

# Confusion matrix - verify the accuracy of each class.
# Rows are the true labels, columns are the predicted labels.
cm = confusion_matrix(y_test, prediction_xgb)

# fmt="d" prints the cell counts as plain integers; the default format can
# switch to scientific notation for counts of 100 or more.
sns.heatmap(cm, annot=True, fmt="d")
plt.xlabel("Predicted label")
plt.ylabel("True label")

# Without an explicit show() the figure is never displayed when this file is
# run as a script.
plt.show()

#######################################################





0 comments on commit 7663ef1

Please sign in to comment.