# liver_disease_detection_machine_learning.py

# -*- coding: utf-8 -*-
"""PCA_Liver_disease_article.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1M6PyB8Awmb-osk4ZrxMPuHzKeQQAKI0b
"""

import pandas as pd
import seaborn as sns
sns.set(rc={'figure.figsize':(8,8)})
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.metrics import confusion_matrix, accuracy_score

# Read the data set from an absolute path at the filesystem root.
dataset = pd.read_csv('/mice_dat_pca.csv')
dataset.head(10)  # notebook-style preview; the result is discarded in a script

# The first column is taken as the target, all remaining columns as features.
target_column = dataset.iloc[:, 0]
feature_columns = dataset.iloc[:, 1:]
y = target_column.values
X = feature_columns.values

"""Plot the histogram of the target value"""

sns.histplot(data=y)

"""Generate synthetic data points"""

from imblearn.over_sampling import SMOTE

# Seeded SMOTE oversampler applied to the complete feature matrix / target.
sm = SMOTE(random_state=0)
X_res, y_res = sm.fit_resample(X=X, y=y)

# Histogram of the target after resampling.
sns.histplot(data=y_res)

# --- Original data: standardize, project onto 2 principal components, plot ---
scaler_orig, pca = StandardScaler(), PCA(n_components=2)
X_orig_norm = scaler_orig.fit_transform(X)
X_proj = pca.fit_transform(X_orig_norm)
sns.scatterplot(x=X_proj[:, 0], y=X_proj[:, 1], hue=y)

# --- SMOTE-resampled data: same steps with an independent scaler and PCA ---
scaler_smote, pca_smote = StandardScaler(), PCA(n_components=2)
X_res_norm = scaler_smote.fit_transform(X_res)
X_sm_proj = pca_smote.fit_transform(X_res_norm)
sns.scatterplot(x=X_sm_proj[:, 0], y=X_sm_proj[:, 1], hue=y_res)

"""Data splitting into test and train """

# Hold out 20% of the ORIGINAL samples before any oversampling. Splitting the
# SMOTE output instead would place synthetic points (interpolated from samples
# that end up in the test set) into the training set, which leaks test
# information and inflates every score computed below.
# `stratify=y` keeps the class proportions of the original data in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y)

# Balance the classes in the training portion only; the test set keeps the
# real class distribution and contains no synthetic samples.
X_train, y_train = SMOTE(random_state=0).fit_resample(X_train, y_train)

"""Fit the scaler on the training data only and reuse it for the test data to avoid information leakage"""

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

"""SEE tutorial: https://www.datacamp.com/community/tutorials/understanding-logistic-regression-python

Classification using ANN (I reduced the number of neurons to avoid excessive overfitting)
"""

# Small feed-forward network: one hidden layer of 6 ReLU units followed by a
# single sigmoid output unit.
ann = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=6, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])
ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
ann.fit(X_train, y_train, batch_size=32, epochs=30)

# Round the sigmoid outputs to hard labels before computing the metrics.
ann_outputs = ann.predict(X_test)
y_pred_ANN = np.round(ann_outputs, 0)
print(confusion_matrix(y_test, y_pred_ANN))
print(accuracy_score(y_test, y_pred_ANN))

"""Desicion trees"""

from sklearn.tree import DecisionTreeClassifier

# Entropy-criterion decision tree with a fixed seed, trained on the training split.
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)

# Evaluate on the held-out split: confusion matrix first, then accuracy.
y_pred_dt = classifier.predict(X_test)
dt_confusion = confusion_matrix(y_test, y_pred_dt)
dt_accuracy = accuracy_score(y_test, y_pred_dt)
print(dt_confusion)
print(dt_accuracy)

"""Random Forest"""

from sklearn.ensemble import RandomForestClassifier

# Forest of 10 entropy-criterion trees with a fixed seed.
rf_settings = {'n_estimators': 10, 'criterion': 'entropy', 'random_state': 0}
classifier = RandomForestClassifier(**rf_settings)
classifier.fit(X_train, y_train)

# Evaluate on the held-out split: confusion matrix first, then accuracy.
y_pred_rf = classifier.predict(X_test)
rf_confusion = confusion_matrix(y_test, y_pred_rf)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(rf_confusion)
print(rf_accuracy)

# Notebook leftovers: these expressions were displayed as cell output in the
# notebook; in a script their values are discarded.
import sklearn
sklearn.__version__

classifier.get_params()

"""Support Vector Machine """

from sklearn import svm

# RBF-kernel support vector classifier with C=10.
classifier = svm.SVC(kernel='rbf', C=10, random_state=0)
classifier.fit(X_train, y_train)

# Evaluate on the held-out split: confusion matrix first, then accuracy.
y_pred_svm = classifier.predict(X_test)
svm_confusion = confusion_matrix(y_test, y_pred_svm)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print(svm_confusion)
print(svm_accuracy)

"""ROC curve for SVM

"""

import matplotlib.pyplot as plt
from itertools import cycle
from sklearn.metrics import roc_curve, auc
# `scipy.interp` was a deprecated alias of `numpy.interp` and is not available
# in newer SciPy releases; import the same function from NumPy instead.
from numpy import interp
from sklearn.metrics import roc_auc_score

"""https://www.datatechnotes.com/2019/11/how-to-create-roc-curve-in-python.html

ROC curves for SVM, ANN and Random Forest

A ROC curve needs a continuous score per sample (decision value or
probability). Feeding it hard 0/1 predictions collapses the curve to a single
operating point and misreports the AUC, so every model below supplies scores.
"""


def plot_roc(y_true, y_score, model_name):
    """Plot the ROC curve of one model twice (plain and annotated) and report its AUC.

    Parameters:
        y_true: ground-truth binary labels of the evaluated samples.
        y_score: continuous score per sample, larger meaning "more positive"
            (decision-function value or positive-class probability).
        model_name: short model label inserted into the plot titles.

    Returns:
        Tuple ``(fpr, tpr, thresholds, roc_auc)`` as produced by
        ``roc_curve`` and ``roc_auc_score``.
    """
    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    roc_auc = roc_auc_score(y_true, y_score)
    print("AUC of ROC Curve:", roc_auc)

    # Plain curve; the title carries the computed AUC rather than a value
    # typed in by hand after an earlier run.
    plt.plot(fpr, tpr)
    plt.title("ROC Curve for %s (%.4f)" % (model_name, roc_auc))
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.show()

    # Annotated curve with the chance diagonal for reference.
    plt.plot(fpr, tpr, label='ROC curve(area = %.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Random guess')
    plt.title('ROC curve for %s' % model_name)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.grid()
    plt.legend()
    plt.show()

    return fpr, tpr, thresholds, roc_auc


"""ROC for SVM"""

y_true = y_test  # ground truth labels
# `classifier` is the SVC fitted in the SVM section; its signed distance to the
# separating surface serves as the ranking score.
y_pred = classifier.decision_function(X_test)
fpr, tpr, thresholds, roc_auc = plot_roc(y_true, y_pred, 'SVM')

"""ROC for ANN"""

y_true = y_test  # ground truth labels
# Raw sigmoid outputs of the network, flattened from (n, 1) to (n,).
y_pred = ann.predict(X_test).ravel()
fpr, tpr, thresholds, roc_auc = plot_roc(y_true, y_pred, 'ANN')

"""ROC for Random Forest"""

y_true = y_test  # ground truth labels
# `classifier` now refers to the SVM, so rebuild the forest with the same
# settings and seed as the Random Forest section to obtain its probabilities.
rf_for_roc = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
rf_for_roc.fit(X_train, y_train)
# Column 1 holds the probability of the larger class label, which roc_curve
# treats as the positive class (assumes binary 0/1 labels -- TODO confirm).
y_pred = rf_for_roc.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds, roc_auc = plot_roc(y_true, y_pred, 'RF')

"""Acuuracy, F1 score, Precision"""

from sklearn import metrics

# All metrics below are computed for the random-forest predictions.
# Accuracy: fraction of all test samples that are classified correctly.
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_rf))
# Precision: of the samples predicted positive, the fraction that really is positive.
print("Precision:", metrics.precision_score(y_test, y_pred_rf))
# Recall: of the truly positive samples, the fraction that is predicted positive.
print("Recall:", metrics.recall_score(y_test, y_pred_rf))
# F1 score: harmonic mean of precision and recall (announced in the section
# header above but previously never computed).
print("F1 score:", metrics.f1_score(y_test, y_pred_rf))

""" K-fold cross-validated paired t test : RV vs. SVM


"""

# Random forest vs. SVM, both with default hyper-parameters and seed 1.
clf1 = RandomForestClassifier(random_state=1)
clf2 = svm.SVC(random_state=1)

# Hold-out accuracy of each model on the train/test split created earlier.
clf1.fit(X_train, y_train)
score1 = clf1.score(X_test, y_test)
clf2.fit(X_train, y_train)
score2 = clf2.score(X_test, y_test)

print(f'Random forest accuracy: {score1 * 100:.2f}%')
print(f'SVM accuracy: {score2 * 100:.2f}%')

from mlxtend.evaluate import paired_ttest_kfold_cv

# NOTE: the paired t-test is run on the full X / y arrays, not on the
# train/test split used for the two accuracies above.
t, p = paired_ttest_kfold_cv(clf1, clf2, X, y, random_seed=1)

print(f't statistic: {t:.3f}')
print(f'p value: {p:.3f}')

""" K-fold cross-validated paired t test : SVM vs. ANN

"""

# NOTE: the section header above says "SVM vs. ANN", but the two estimators
# compared here are a decision tree (clf1) and an SVM (clf2).
clf1 = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
clf2 = svm.SVC(random_state=1)

# Hold-out accuracy of each model on the train/test split created earlier.
score1 = clf1.fit(X_train, y_train).score(X_test, y_test)
score2 = clf2.fit(X_train, y_train).score(X_test, y_test)

# score1 belongs to the decision tree; it was previously mislabelled as
# "Random forest accuracy".
print('Decision tree accuracy: %.2f%%' % (score1*100))
print('SVM accuracy: %.2f%%' % (score2*100))

from mlxtend.evaluate import paired_ttest_kfold_cv

# K-fold cross-validated paired t-test on the full X / y arrays.
t, p = paired_ttest_kfold_cv(estimator1=clf1,
                              estimator2=clf2,
                              X=X, y=y,
                              random_seed=1)

print('t statistic: %.3f' % t)
print('p value: %.3f' % p)

"""K-fold cross-validated paired t test : RF vs. ANN"""

from mlxtend.evaluate import paired_ttest_kfold_cv
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline

# Build both estimators BEFORE running the test; previously the test ran on the
# clf1 / clf2 left over from the preceding section and therefore just repeated
# that comparison.
clf1 = RandomForestClassifier(random_state=1)

# The Keras model `ann` is not a scikit-learn estimator: calling it with
# `criterion=` / `random_state=` raises, and the object returned by its `fit()`
# has no `.score()`. A scikit-learn MLP with the same layout (one hidden layer
# of 6 ReLU units, Adam optimizer) stands in for it here.
# NOTE(review): this is a stand-in, not the Keras implementation -- confirm it
# is acceptable for the reported comparison.
# The scaler is part of the pipeline because the t-test below is run on the
# unscaled X, so each fold is standardized using its own training part only.
clf2 = make_pipeline(
    StandardScaler(),
    MLPClassifier(hidden_layer_sizes=(6,), activation='relu', solver='adam',
                  max_iter=1000, random_state=0),
)

# Hold-out accuracy of each model on the train/test split created earlier.
score1 = clf1.fit(X_train, y_train).score(X_test, y_test)
score2 = clf2.fit(X_train, y_train).score(X_test, y_test)

print('Random forest accuracy: %.2f%%' % (score1*100))
print('ANN accuracy: %.2f%%' % (score2*100))

# K-fold cross-validated paired t-test on the full X / y arrays.
t, p = paired_ttest_kfold_cv(estimator1=clf1,
                              estimator2=clf2,
                              X=X, y=y,
                              random_seed=1)

print('t statistic: %.3f' % t)
print('p value: %.3f' % p)