Commit 0111ade (parent: dbe534b)
Showing 4 changed files with 1,375 additions and 5 deletions.

79 changes: 79 additions & 0 deletions
...al_documents/Exercises/07 - Learning from data - Applied machine learning/AppliedML.ipynb
@@ -0,0 +1,79 @@

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy as sp
from itertools import combinations
import ast
from sklearn.linear_model import LogisticRegression
import seaborn as sn
%matplotlib inline

data_folder = './data/'

columns = ['animal_type', 'intake_year', 'intake_condition', 'intake_number', 'intake_type', 'sex_upon_intake', \
           'age_upon_intake_(years)', 'time_in_shelter_days', 'sex_upon_outcome', 'age_upon_outcome_(years)', \
           'outcome_type']
original_data = pd.read_csv(data_folder + 'aac_intakes_outcomes.csv', usecols=columns)
original_data.head()

# Work on a copy so original_data stays intact (this first line is missing in the
# skeleton but required for the cell to run; it matches the solutions notebook)
data_features = original_data.copy()
data_features['adopted'] = data_features.outcome_type.apply(lambda r: 1 if r == 'Adoption' else 0)
data_features.drop('outcome_type', axis=1, inplace=True)

# Dummy encoding
dummy_data = pd.get_dummies(original_data, columns=['outcome_type', 'sex_upon_outcome', 'animal_type',
                                                    'intake_condition', 'intake_type', 'sex_upon_intake'])
dummy_data

# Standardize
dummy_data = (dummy_data - dummy_data.mean()) / dummy_data.std()
dummy_data

def split_set(data_to_split, ratio=0.8):
    mask = np.random.rand(len(data_to_split)) < ratio
    return [data_to_split[mask].reset_index(drop=True), data_to_split[~mask].reset_index(drop=True)]

[train, test] = split_set(dummy_data)
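
split_set draws a fresh random mask on every call, so each run produces a different train/test split; fixing NumPy's seed beforehand makes the split reproducible (the seed value is an arbitrary choice of mine, not part of the exercise):

np.random.seed(42)  # fix the RNG so the split below is identical on every run
[train, test] = split_set(dummy_data)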

211 changes: 211 additions & 0 deletions
...ts/Exercises/07 - Learning from data - Applied machine learning/AppliedML_solutions.ipynb
@@ -0,0 +1,211 @@

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy as sp
from itertools import combinations
import ast
from sklearn.linear_model import LogisticRegression
import seaborn as sn
%matplotlib inline

data_folder = './data/'

columns = ['animal_type', 'intake_year', 'intake_condition', 'intake_number', 'intake_type', 'sex_upon_intake', \
           'age_upon_intake_(years)', 'time_in_shelter_days', 'sex_upon_outcome', 'age_upon_outcome_(years)', \
           'outcome_type']
original_data = pd.read_csv(data_folder + 'aac_intakes_outcomes.csv', usecols=columns)

print('Number of rows before dropping NaN values: {}'.format(len(original_data)))
original_data.dropna(inplace=True)
print('Number of rows after dropping NaN values: {}'.format(len(original_data)))

data_features = original_data.copy()
data_features['adopted'] = data_features.outcome_type.apply(lambda r: 1 if r == 'Adoption' else 0)
data_features.drop('outcome_type', axis=1, inplace=True)
data_features.head()
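
A quick look at the base rate of the positive class gives useful context for judging the accuracy figures later (this check is my addition, not part of the original solution):

print('Fraction of animals adopted: {:.3f}'.format(data_features.adopted.mean()))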

def split_set(data_to_split, ratio=0.8):
    mask = np.random.rand(len(data_to_split)) < ratio
    return [data_to_split[mask].reset_index(drop=True), data_to_split[~mask].reset_index(drop=True)]

[train, test] = split_set(data_features)

categorical_columns = ['sex_upon_outcome', 'animal_type', 'intake_condition',
                       'intake_type', 'sex_upon_intake']
train_categorical = pd.get_dummies(train, columns=categorical_columns)
train_categorical.columns

# Make sure we use only the features available in the training set
test_categorical = pd.get_dummies(test, columns=categorical_columns)[train_categorical.columns]
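
Indexing with train_categorical.columns drops any dummy column that appears only in the test split, but it raises a KeyError if a training category is missing from the test split entirely. A more defensive variant using reindex (my suggestion, not what the original solution does) fills such missing columns with zeros instead:

test_categorical = pd.get_dummies(test, columns=categorical_columns) \
                     .reindex(columns=train_categorical.columns, fill_value=0)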

train_label = train_categorical.adopted
train_features = train_categorical.drop('adopted', axis=1)
print('Length of the train dataset: {}'.format(len(train)))

test_label = test_categorical.adopted
test_features = test_categorical.drop('adopted', axis=1)
print('Length of the test dataset: {}'.format(len(test)))

means = train_features.mean()
stddevs = train_features.std()

train_features_std = pd.DataFrame()
for c in train_features.columns:
    train_features_std[c] = (train_features[c] - means[c]) / stddevs[c]

# Use the mean and stddev of the training set
test_features_std = pd.DataFrame()
for c in test_features.columns:
    test_features_std[c] = (test_features[c] - means[c]) / stddevs[c]

train_features_std.head()
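
The same per-column standardization can be written with scikit-learn's StandardScaler (a sketch of the equivalent, not what the solution uses; the _sk names are mine to avoid clobbering the variables above, and StandardScaler divides by the population standard deviation while pandas' .std() uses the sample standard deviation, so values differ very slightly):

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(train_features)  # learn mean/std on the training set only
train_features_std_sk = pd.DataFrame(scaler.transform(train_features), columns=train_features.columns)
test_features_std_sk = pd.DataFrame(scaler.transform(test_features), columns=test_features.columns)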

def compute_confusion_matrix(true_label, prediction_proba, decision_threshold=0.5):
    predict_label = (prediction_proba[:, 1] > decision_threshold).astype(int)

    TP = np.sum(np.logical_and(predict_label == 1, true_label == 1))
    TN = np.sum(np.logical_and(predict_label == 0, true_label == 0))
    FP = np.sum(np.logical_and(predict_label == 1, true_label == 0))
    FN = np.sum(np.logical_and(predict_label == 0, true_label == 1))

    confusion_matrix = np.asarray([[TP, FP],
                                   [FN, TN]])
    return confusion_matrix
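
Note that this [[TP, FP], [FN, TN]] layout differs from sklearn.metrics.confusion_matrix, which returns [[TN, FP], [FN, TP]] with true labels on the rows, so the two are not interchangeable. A small self-contained check of the correspondence (the toy labels and probabilities are invented for illustration):

from sklearn.metrics import confusion_matrix as sk_confusion_matrix

y_true = np.array([1, 1, 1, 1, 0, 0, 0, 0])
proba = np.array([[0.1, 0.9], [0.2, 0.8], [0.3, 0.7], [0.6, 0.4],
                  [0.4, 0.6], [0.2, 0.8], [0.7, 0.3], [0.8, 0.2]])

ours = compute_confusion_matrix(y_true, proba)                         # [[TP, FP], [FN, TN]]
theirs = sk_confusion_matrix(y_true, (proba[:, 1] > 0.5).astype(int))  # [[TN, FP], [FN, TP]]
assert (theirs[::-1, ::-1].T == ours).all()  # flip both axes and transpose to match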

def plot_confusion_matrix(confusion_matrix):
    [[TP, FP], [FN, TN]] = confusion_matrix
    label = np.asarray([['TP {}'.format(TP), 'FP {}'.format(FP)],
                        ['FN {}'.format(FN), 'TN {}'.format(TN)]])

    df_cm = pd.DataFrame(confusion_matrix, index=['Yes', 'No'], columns=['Positive', 'Negative'])

    return sn.heatmap(df_cm, cmap='YlOrRd', annot=label, annot_kws={"size": 16}, cbar=False, fmt='')

def compute_all_score(confusion_matrix, t=0.5):
    [[TP, FP], [FN, TN]] = confusion_matrix.astype(float)

    accuracy = (TP + TN) / np.sum(confusion_matrix)

    precision_positive = TP / (TP + FP) if (TP + FP) != 0 else np.nan
    precision_negative = TN / (TN + FN) if (TN + FN) != 0 else np.nan

    recall_positive = TP / (TP + FN) if (TP + FN) != 0 else np.nan
    recall_negative = TN / (TN + FP) if (TN + FP) != 0 else np.nan

    F1_score_positive = 2 * (precision_positive * recall_positive) / (precision_positive + recall_positive) if (precision_positive + recall_positive) != 0 else np.nan
    F1_score_negative = 2 * (precision_negative * recall_negative) / (precision_negative + recall_negative) if (precision_negative + recall_negative) != 0 else np.nan

    return [t, accuracy, precision_positive, recall_positive, F1_score_positive, precision_negative, recall_negative, F1_score_negative]
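
As a quick hand check of these formulas on a made-up matrix (numbers invented for illustration), take [[TP, FP], [FN, TN]] = [[8, 2], [4, 6]]: accuracy = (8+6)/20 = 0.7, positive precision = 8/10 = 0.8, positive recall = 8/12 ≈ 0.667, and positive F1 = 2·(0.8·0.667)/(0.8+0.667) ≈ 0.727.

compute_all_score(np.asarray([[8, 2], [4, 6]]))
# [0.5, 0.7, 0.8, 0.667, 0.727, 0.6, 0.75, 0.667] (rounded)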

logistic = LogisticRegression(solver='lbfgs', max_iter=10000)
logistic.fit(train_features_std, train_label)

prediction_proba = logistic.predict_proba(test_features_std)
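
The columns of prediction_proba follow logistic.classes_ (labels in sorted order), so with 0/1 labels column 1 holds the probability of the positive class, which is what compute_confusion_matrix indexes; a quick sanity check:

print(logistic.classes_)        # expected: [0 1]
print(prediction_proba[:3, 1])  # P(adopted = 1) for the first three test animals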

confusion_matrix_05 = compute_confusion_matrix(test_label, prediction_proba, 0.5)
plt.figure(figsize=(4, 3))
ax = plot_confusion_matrix(confusion_matrix_05)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Confusion matrix for a 0.5 threshold')

[t, accuracy, precision_positive, recall_positive, F1_score_positive, \
 precision_negative, recall_negative, F1_score_negative] = compute_all_score(confusion_matrix_05)

print("The accuracy of this model is {0:1.3f}".format(accuracy))
print("For the positive case, the precision is {0:1.3f}, the recall is {1:1.3f} and the F1 score is {2:1.3f}"\
      .format(precision_positive, recall_positive, F1_score_positive))
print("For the negative case, the precision is {0:1.3f}, the recall is {1:1.3f} and the F1 score is {2:1.3f}"\
      .format(precision_negative, recall_negative, F1_score_negative))

threshold = np.linspace(0, 1, 100)

columns_score_name = ['Threshold', 'Accuracy', 'Precision P', 'Recall P', 'F1 score P', \
                      'Precision N', 'Recall N', 'F1 score N']
threshold_score = pd.concat([pd.DataFrame([compute_all_score(compute_confusion_matrix(test_label, prediction_proba, t), t)],
                                          columns=columns_score_name) for t in threshold], ignore_index=True)
threshold_score.set_index('Threshold', inplace=True)
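
With every score indexed by threshold, picking the operating point that maximizes a chosen metric is a one-liner; positive-class F1 is used here purely as an example choice:

best_t = threshold_score['F1 score P'].idxmax()
print('Threshold maximizing positive F1: {:.2f}'.format(best_t))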

threshold_score['Accuracy'].plot(grid=True).set_title('Accuracy')

fig, axs = plt.subplots(nrows=2, ncols=3, sharex=True, sharey=True, figsize=(10, 5))

col_plot = ['Precision P', 'Recall P', 'F1 score P', 'Precision N', 'Recall N', 'F1 score N']

major_ticks = np.linspace(0, 1, 5)

for axe, col in zip(axs.flat, col_plot):
    threshold_score[col].plot(ax=axe, grid=True)
    axe.set_title(col)
    axe.set_xticks(major_ticks)
    axe.grid(which='major', alpha=0.5)

logistic = LogisticRegression(solver='lbfgs', max_iter=10000)
logistic.fit(train_features_std, train_label)

tmp = []
for name, value in zip(train_features_std.columns, logistic.coef_[0]):
    tmp.append({"name": name, "value": value})

features_coef = pd.DataFrame(tmp).sort_values("value")
features_coef.head()

plt.subplots(figsize=(5, 7))
plt.barh(features_coef.name, features_coef.value, alpha=0.6)
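
Because the features were standardized, these coefficients are directly comparable: each is the change in the log-odds of adoption per one standard deviation of its feature. Exponentiating turns them into odds ratios (a small interpretive sketch, not part of the original solution):

features_coef['odds_ratio'] = np.exp(features_coef.value)  # e.g. 1.5 means +50% odds per std dev
features_coef.tail()  # the features most positively associated with adoption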