Cervical_cancer_data_preperation.Rmd

---
title: 'ADS 503: Cervical Cancer Biopsy Prediction Project'
author: "Ruddy Simonpour & Shailja Somani"
date: "May 30, 2023"
output:
  word_document: default
  pdf_document: default
---

```{r, warning=FALSE, message=FALSE}
# load necessary packages for files above
library(Hmisc)
library(dplyr)
library(pROC)
library(reshape2)
library(ggplot2)
library(caret)
library(ROSE)
```

```{r}
# Load the project's helper scripts (data ingestion, EDA plots, preprocessing,
# modeling) so later chunks can call the functions they define.
#
# The pipeline folder defaults to the original author's checkout. To run the
# notebook elsewhere, set the ADS503_PIPELINE_DIR environment variable to the
# folder that contains the four scripts below, e.g.
#   Sys.setenv(ADS503_PIPELINE_DIR = "/Users/shailjasomani/Documents/USD_MS_ADS/ADS_503/Final_Proj")
pipeline_dir <- Sys.getenv("ADS503_PIPELINE_DIR", unset = "")
if (!nzchar(pipeline_dir)) {
  pipeline_dir <- "/Users/ruddysimonpour/Desktop/University of Sandiego - Curriculum/ADS 503 - Applied Predictive Modeling/ADS503-Applied-Predictive-Modeling/Pipeline"
}

# Fail early with an actionable message instead of an opaque setwd() error
if (!dir.exists(pipeline_dir)) {
  stop(
    "Pipeline directory not found: ", pipeline_dir,
    ". Set ADS503_PIPELINE_DIR to the folder containing the helper scripts.",
    call. = FALSE
  )
}

suppressWarnings({
  # choose a location/path and set the working directory - will only set for
  # this chunk though
  setwd(pipeline_dir)

  source("Data_Ingestion.R")
  source("Viz_EDA.R")
  source("Preprocessing.R")
  source("Modeling.R")
})
```

# Data Importing

```{r}
# Uses functions from the sourced helper files to load and clean the data
set.seed(7)

# Location of the raw CSV. Defaults to the original author's path; override it
# by setting the ADS503_DATA_PATH environment variable, e.g.
#   Sys.setenv(ADS503_DATA_PATH = "/Users/shailjasomani/Documents/USD_MS_ADS/ADS_503/Final_Proj/kag_risk_factors_cervical_cancer.csv")
data_path <- Sys.getenv("ADS503_DATA_PATH", unset = "")
if (!nzchar(data_path)) {
  data_path <- "/Users/ruddysimonpour/Desktop/University of Sandiego - Curriculum/ADS 503 - Applied Predictive Modeling/ADS503-Applied-Predictive-Modeling/input_resource/kag_risk_factors_cervical_cancer.csv"
}

# loading data with the project helper read_data()
cervical_data_raw <- read_data(x = data_path)
head(cervical_data_raw, 5)

dim(cervical_data_raw)

# check missing data
null_counts_raw <- check_nulls(cervical_data_raw)

# remove cols with more than 85% missing data
# NOTE(review): the 85% threshold lives inside remove_cols() - confirm there
cervical_data_clean <- remove_cols(cervical_data_raw)

dim(cervical_data_clean)
```

# Exploratory Data Analysis (EDA)

```{r, results='hide'}
# These user-defined functions are pulled from the Viz_EDA.R file.
# Both are called on the cleaned data frame (after high-missingness columns
# were removed) and are left as top-level calls so any returned plot object
# is auto-printed by knitr.

# Look at all histograms of features collectively
hist.df(cervical_data_clean)

# Create boxplots for all features - helps visualize outliers
boxplot.df(cervical_data_clean)
```
# Data Cleaning

```{r}
library(caret)

# remove near zero variance variables
dim(cervical_data_clean)
degeneratecols <- nearZeroVar(cervical_data_clean)

length(degeneratecols) # number of cols that are degenerate distributions

# Negative indexing with an empty vector selects zero columns, so only drop
# columns when nearZeroVar() actually flagged some.
if (length(degeneratecols) > 0) {
  cervical_data_process <- cervical_data_clean[, -degeneratecols, drop = FALSE]
} else {
  cervical_data_process <- cervical_data_clean
}
dim(cervical_data_process)

# impute missing values with knn
#data_clean <- impute_with_knn(cervical_data_process, k = 29) # the rule of thumbs choosing the k is the square root of the number of samples
# NOTE: caret's "knnImpute" also centers and scales the numeric columns, so
# data_clean holds standardized values rather than the raw measurement units.
# NOTE(review): the imputation is fitted on the full data set before the
# train/test split, so test rows influence the imputed values - consider
# fitting it on the training partition only.
preproc <- preProcess(cervical_data_process, method = "knnImpute")
data_clean <- predict(preproc, cervical_data_process)

# keep only the original feature columns, Age through Biopsy
# (original note: knn imputation may create new columns, which are excluded here)
data_clean <- subset(data_clean, select = Age:Biopsy)

# confirm no missing values remain after imputation
null_counts_clean <- check_nulls(data_clean)
```


## EDA - Correlations Analysis

```{r}
# cor() needs numeric input, so coerce the outcome column to numeric first
data_clean[["Biopsy"]] <- as.numeric(data_clean[["Biopsy"]])

# Build the correlation heatmap with the project helper create_heatmap()
heatmap <- create_heatmap(
  "Cervical Cancer Dataset Variable Correlations",
  data_clean
)

# Show the heatmap in the document and keep a PNG copy on disk
print(heatmap)

ggsave(plot = heatmap, filename = "cor-matrix.png", height = 7, width = 7)
```

## Check highly correlated predictors 

```{r}

# Flag columns whose pairwise absolute correlation exceeds 0.9
highlyCorrelated <- findCorrelation(cor(data_clean), cutoff = 0.9)

print(names(data_clean)[highlyCorrelated])

# drop highly correlated variables
# - never drop the outcome column, which later chunks require
# - skip the subset entirely when nothing is flagged, because negative
#   indexing with an empty vector would remove every column
cols_to_drop <- setdiff(highlyCorrelated, match("Biopsy", names(data_clean)))
if (length(cols_to_drop) > 0) {
  data_clean <- data_clean[, -cols_to_drop, drop = FALSE]
}
```

## Convert the class to factor variable

```{r}
# Turn the (numeric) outcome back into a two-level factor; the lower value is
# labelled "No" and the higher value "Yes"
data_clean$Biopsy <- factor(data_clean$Biopsy, labels = c("No", "Yes"))
```


# Data Partitioning (Train and Test Split)

```{r, warning=FALSE}
# data splitting: 80% train / 20% test, partitioned on the outcome with caret
set.seed(100)

trainIndex <- createDataPartition(data_clean$Biopsy, p = .8, list = FALSE)
trainData <- data_clean[trainIndex, ]
testData  <- data_clean[-trainIndex, ]

# Separate predictors from the outcome. drop = FALSE keeps a data frame even
# if only a single predictor column remains.
train_X <- trainData[, !(names(trainData) %in% "Biopsy"), drop = FALSE]
train_y <- trainData$Biopsy

test_X <- testData[, !(names(testData) %in% "Biopsy"), drop = FALSE]
test_y <- testData$Biopsy


############################################### Imbalance class

# plotting number of samples in each class - original dataset
options(scipen = 10000) # avoid scientific notation on the count axis

train_y_df <- data.frame(Biopsy = train_y)

# Create the plot (after_stat(count) replaces the deprecated ..count.. notation)
p <- ggplot(data = train_y_df, aes(x = Biopsy, fill = Biopsy)) +
    geom_bar() +
    geom_text(stat = "count", aes(label = after_stat(count)), vjust = 1) +
    ggtitle("Number of samples in each class", subtitle = "Original dataset") +
    xlab("") +
    ylab("Samples") +
    scale_y_continuous(expand = c(0, 0)) +
    scale_x_discrete(expand = c(0, 0)) +
    theme(legend.position = "none",
         legend.title = element_blank(),
         panel.grid.major = element_blank(),
         panel.grid.minor = element_blank(),
         panel.background = element_blank())
p

# Make sure the output folder exists before saving the figure
dir.create("Images", showWarnings = FALSE)
ggsave(filename = "Images/class_imbalance1.png", plot = p, width = 7, height = 7)

```


## Class Imbalance (ROSE)

```{r}

# Implementing the ROSE function to handle the class imbalance problem.
# Only the training partition is resampled; the test set is left untouched.

library(ROSE)

set.seed(100)

rose_train <- ROSE(Biopsy ~ ., data = trainData)$data

# From here on train_X / train_y refer to the ROSE-balanced training data
train_X <- rose_train[, !(names(rose_train) %in% "Biopsy"), drop = FALSE]
train_y <- rose_train$Biopsy


options(scipen = 10000) # avoid scientific notation on the count axis

train_y_df <- data.frame(Biopsy = train_y)

# Class counts after resampling. The subtitle previously read
# "Original dataset", copied over from the pre-ROSE plot.
p1 <- ggplot(data = train_y_df, aes(x = Biopsy, fill = Biopsy)) +
    geom_bar() +
    geom_text(stat = "count", aes(label = after_stat(count)), vjust = 1) +
    ggtitle("Number of samples in each class after ROSE technique implementation",
            subtitle = "ROSE-balanced training set") +
    xlab("") +
    ylab("Samples") +
    scale_y_continuous(expand = c(0, 0)) +
    scale_x_discrete(expand = c(0, 0)) +
    theme(legend.position = "none",
         legend.title = element_blank(),
         panel.grid.major = element_blank(),
         panel.grid.minor = element_blank(),
         panel.background = element_blank())
p1

# Make sure the output folder exists before saving the figure
dir.create("Images", showWarnings = FALSE)
ggsave(filename = "Images/class_imbalance2.png", plot = p1, width = 7, height = 4)

```

## Data Pre-Processing

```{r}
# Estimate centering/scaling parameters on the training predictors only, then
# apply that same transformation to both the training and the test predictors
preProcValues <- preProcess(train_X, method = c("center", "scale"))

train_X <- predict(preProcValues, train_X)
test_X <- predict(preProcValues, test_X)

# Shared resampling setup for every model: 10-fold cross-validation, class
# probabilities enabled, two-class summary metrics, hold-out predictions kept
cntrl <- trainControl(
  method = "cv",
  number = 10,
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  savePredictions = TRUE
)

```

# Modeling 

## Non-Linear models

### Neural Network Model

```{r}
### Neural Network Model
# Fit on the ROSE-balanced, centered/scaled training data using the project
# helper train_nnet_model() and the shared trainControl object `cntrl`.
# NOTE(review): the third argument is ncol(trainData), which counts the Biopsy
# column as well as the predictors - confirm the helper expects that.
nnet_model <- train_nnet_model(train_X, train_y, ncol(trainData), cntrl)
```


```{r}
# get prediction result on the held-out test set; the code below reads the
# `prediction`, `observation` and `class_prob` columns of the helper's output
testResults_nnet <- get_prediction_results(nnet_model, test_X, test_y)

# convert prediction levels to match observation
# NOTE(review): assumes get_prediction_results() encodes a predicted positive
# as "1" - confirm in Modeling.R; any other encoding maps every row to "No".
testResults_nnet$prediction <- ifelse(testResults_nnet$prediction == "1", "Yes", "No")

# confusion matrix
# - explicit levels keep a full 2x2 table even if only one class is predicted
# - positive = "Yes" makes sensitivity/specificity describe cancer detection
#   (caret otherwise treats the first level, "No", as the positive class)
cm <- confusionMatrix(
  factor(testResults_nnet$prediction, levels = c("No", "Yes")),
  as.factor(testResults_nnet$observation),
  positive = "Yes"
)
print(cm)

# neural network model result plot
plot(nnet_model)

nnet_model$finalModel

# roc/auc result
# NOTE(review): presumably class_prob is the predicted probability of "Yes" - confirm.
roc_nnet <- roc(testResults_nnet$observation, testResults_nnet$class_prob)
auc(roc_nnet)
plot(roc_nnet)

```

### Multivariate Adaptive Regression Splines (MARS)

```{r}
# Fit the MARS model on the balanced, centered/scaled training data with the
# project helper train_mars_model() and the shared trainControl object.
# NOTE(review): 2:20 is presumably the grid of candidate term counts - confirm in Modeling.R.
mars_model <- train_mars_model(train_X, train_y, 2:20, cntrl)
```


```{r}
# get prediction result on the held-out test set; the code below reads the
# `prediction`, `observation` and `class_prob` columns of the helper's output
testResults_mars <- get_prediction_results(mars_model, test_X, test_y)

# convert prediction levels to match observation
# NOTE(review): assumes get_prediction_results() encodes a predicted positive
# as "1" - confirm in Modeling.R; any other encoding maps every row to "No".
testResults_mars$prediction <- ifelse(testResults_mars$prediction == "1", "Yes", "No")

# confusion matrix
# - explicit levels keep a full 2x2 table even if only one class is predicted
# - positive = "Yes" makes sensitivity/specificity describe cancer detection
#   (caret otherwise treats the first level, "No", as the positive class)
cm <- confusionMatrix(
  factor(testResults_mars$prediction, levels = c("No", "Yes")),
  as.factor(testResults_mars$observation),
  positive = "Yes"
)
print(cm)

# mars model result plot
plot(mars_model)

mars_model$finalModel

# roc/auc result
# NOTE(review): presumably class_prob is the predicted probability of "Yes" - confirm.
roc_mars <- roc(testResults_mars$observation, testResults_mars$class_prob)
auc(roc_mars)
plot(roc_mars)
```

### Support Vector Machine (SVM) 

#### svmRadial

```{r, warning=FALSE}
# Fit the radial-kernel SVM on the balanced, centered/scaled training data with
# the project helper train_svm_model() and the shared trainControl object.
# NOTE(review): 20 is presumably the tuning-grid length - confirm in Modeling.R.
svm_model <- train_svm_model(train_X, train_y, 20, cntrl)
```

```{r}
# get prediction result on the held-out test set; the code below reads the
# `prediction`, `observation` and `class_prob` columns of the helper's output
testResults_svm <- get_prediction_results(svm_model, test_X, test_y)

# convert prediction levels to match observation
# NOTE(review): assumes get_prediction_results() encodes a predicted positive
# as "1" - confirm in Modeling.R; any other encoding maps every row to "No".
testResults_svm$prediction <- ifelse(testResults_svm$prediction == "1", "Yes", "No")

# confusion matrix
# - explicit levels keep a full 2x2 table even if only one class is predicted
# - positive = "Yes" makes sensitivity/specificity describe cancer detection
#   (caret otherwise treats the first level, "No", as the positive class)
cm <- confusionMatrix(
  factor(testResults_svm$prediction, levels = c("No", "Yes")),
  as.factor(testResults_svm$observation),
  positive = "Yes"
)
print(cm)

# svm Radial result plot
plot(svm_model)

svm_model$finalModel

# roc/auc result
# NOTE(review): presumably class_prob is the predicted probability of "Yes" - confirm.
roc_svm <- roc(testResults_svm$observation, testResults_svm$class_prob)
auc(roc_svm)
plot(roc_svm)

```

#### svmPoly

```{r, warning=FALSE}
# Fit the polynomial-kernel SVM on the balanced, centered/scaled training data
# with the project helper train_svm_poly() and the shared trainControl object.
svm_modelPoly <- train_svm_poly(train_X, train_y, cntrl)
```


```{r}
# get prediction result on the held-out test set; the code below reads the
# `prediction`, `observation` and `class_prob` columns of the helper's output
testResults_svmP <- get_prediction_results(svm_modelPoly, test_X, test_y)

# convert prediction levels to match observation
# NOTE(review): assumes get_prediction_results() encodes a predicted positive
# as "1" - confirm in Modeling.R; any other encoding maps every row to "No".
testResults_svmP$prediction <- ifelse(testResults_svmP$prediction == "1", "Yes", "No")

# confusion matrix
# - explicit levels keep a full 2x2 table even if only one class is predicted
# - positive = "Yes" makes sensitivity/specificity describe cancer detection
#   (caret otherwise treats the first level, "No", as the positive class)
cm <- confusionMatrix(
  factor(testResults_svmP$prediction, levels = c("No", "Yes")),
  as.factor(testResults_svmP$observation),
  positive = "Yes"
)
print(cm)

# svm Poly result plot
plot(svm_modelPoly)

svm_modelPoly$finalModel

# roc/auc result
# NOTE(review): presumably class_prob is the predicted probability of "Yes" - confirm.
roc_svmp <- roc(testResults_svmP$observation, testResults_svmP$class_prob)
auc(roc_svmp)
plot(roc_svmp)
```
### K-Nearest Neighbors

```{r}
# Fit k-nearest neighbors on the balanced, centered/scaled training data with
# the project helper knn_model_train() and the shared trainControl object.
# NOTE(review): 1:11 is presumably the grid of candidate k values - confirm in Modeling.R.
knn_model <- knn_model_train(train_X, train_y, cntrl, 1:11)
```

```{r}
# get prediction result on the held-out test set; the code below reads the
# `prediction`, `observation` and `class_prob` columns of the helper's output
testResults_knn <- get_prediction_results(knn_model, test_X, test_y)

# convert prediction levels to match observation
# NOTE(review): assumes get_prediction_results() encodes a predicted positive
# as "1" - confirm in Modeling.R; any other encoding maps every row to "No".
testResults_knn$prediction <- ifelse(testResults_knn$prediction == "1", "Yes", "No")

# confusion matrix
# - explicit levels keep a full 2x2 table even if only one class is predicted
# - positive = "Yes" makes sensitivity/specificity describe cancer detection
#   (caret otherwise treats the first level, "No", as the positive class)
cm <- confusionMatrix(
  factor(testResults_knn$prediction, levels = c("No", "Yes")),
  as.factor(testResults_knn$observation),
  positive = "Yes"
)
print(cm)

# kNN result plot
plot(knn_model)

knn_model$finalModel

# roc/auc result
# NOTE(review): presumably class_prob is the predicted probability of "Yes" - confirm.
roc_knn <- roc(testResults_knn$observation, testResults_knn$class_prob)
auc(roc_knn)
plot(roc_knn)
```
### Random Forest Model

```{r}
# Fit the random forest on the balanced, centered/scaled training data with the
# project helper rf_model_train() and the shared trainControl object.
rf_model <- rf_model_train(train_X, train_y, cntrl)
```


```{r}
# get prediction result on the held-out test set; the code below reads the
# `prediction`, `observation` and `class_prob` columns of the helper's output
testResults_rf <- get_prediction_results(rf_model, test_X, test_y)

# convert prediction levels to match observation
# NOTE(review): assumes get_prediction_results() encodes a predicted positive
# as "1" - confirm in Modeling.R; any other encoding maps every row to "No".
testResults_rf$prediction <- ifelse(testResults_rf$prediction == "1", "Yes", "No")

# confusion matrix
# - explicit levels keep a full 2x2 table even if only one class is predicted
# - positive = "Yes" makes sensitivity/specificity describe cancer detection
#   (caret otherwise treats the first level, "No", as the positive class)
cm <- confusionMatrix(
  factor(testResults_rf$prediction, levels = c("No", "Yes")),
  as.factor(testResults_rf$observation),
  positive = "Yes"
)
print(cm)

# RF result plot
plot(rf_model)

rf_model$finalModel

# roc/auc result
# NOTE(review): presumably class_prob is the predicted probability of "Yes" - confirm.
roc_rf <- roc(testResults_rf$observation, testResults_rf$class_prob)
auc(roc_rf)
plot(roc_rf)
```

## Linear Model

### Logistic Regression

```{r, warning=FALSE}
# Fit logistic regression on the balanced, centered/scaled training data with
# the project helper lr_model_train() and the shared trainControl object.
lr_model <- lr_model_train(train_X, train_y, cntrl)
```

```{r}
# get prediction result on the held-out test set; the code below reads the
# `prediction`, `observation` and `class_prob` columns of the helper's output
testResults_lr <- get_prediction_results(lr_model, test_X, test_y)

# convert prediction levels to match observation
# NOTE(review): assumes get_prediction_results() encodes a predicted positive
# as "1" - confirm in Modeling.R; any other encoding maps every row to "No".
testResults_lr$prediction <- ifelse(testResults_lr$prediction == "1", "Yes", "No")

# confusion matrix
# - explicit levels keep a full 2x2 table even if only one class is predicted
# - positive = "Yes" makes sensitivity/specificity describe cancer detection
#   (caret otherwise treats the first level, "No", as the positive class)
cm <- confusionMatrix(
  factor(testResults_lr$prediction, levels = c("No", "Yes")),
  as.factor(testResults_lr$observation),
  positive = "Yes"
)
print(cm)

# roc/auc result
# NOTE(review): presumably class_prob is the predicted probability of "Yes" - confirm.
roc_lr <- roc(testResults_lr$observation, testResults_lr$class_prob)
auc(roc_lr)
plot(roc_lr)
```

### LDA Model
```{r}
# Fit linear discriminant analysis on the balanced, centered/scaled training
# data with the project helper lda_model_train() and the shared trainControl object.
lda_model <- lda_model_train(train_X, train_y, cntrl)
```

```{r}
# get prediction result on the held-out test set; the code below reads the
# `prediction`, `observation` and `class_prob` columns of the helper's output
testResults_lda <- get_prediction_results(lda_model, test_X, test_y)

# convert prediction levels to match observation
# NOTE(review): assumes get_prediction_results() encodes a predicted positive
# as "1" - confirm in Modeling.R; any other encoding maps every row to "No".
testResults_lda$prediction <- ifelse(testResults_lda$prediction == "1", "Yes", "No")

# confusion matrix
# - explicit levels keep a full 2x2 table even if only one class is predicted
# - positive = "Yes" makes sensitivity/specificity describe cancer detection
#   (caret otherwise treats the first level, "No", as the positive class)
cm <- confusionMatrix(
  factor(testResults_lda$prediction, levels = c("No", "Yes")),
  as.factor(testResults_lda$observation),
  positive = "Yes"
)
print(cm)

# roc/auc result
# NOTE(review): presumably class_prob is the predicted probability of "Yes" - confirm.
roc_lda <- roc(testResults_lda$observation, testResults_lda$class_prob)
auc(roc_lda)
plot(roc_lda)
```

### Penalized Logistic Regression
```{r}
# Fit penalized logistic regression on the balanced, centered/scaled training
# data with the project helper glmn_model_train() and the shared trainControl object.
glmn_model <- glmn_model_train(train_X, train_y, cntrl)
```

```{r}
# get prediction result on the held-out test set; the code below reads the
# `prediction`, `observation` and `class_prob` columns of the helper's output
testResults_glmn <- get_prediction_results(glmn_model, test_X, test_y)

# convert prediction levels to match observation
# NOTE(review): assumes get_prediction_results() encodes a predicted positive
# as "1" - confirm in Modeling.R; any other encoding maps every row to "No".
testResults_glmn$prediction <- ifelse(testResults_glmn$prediction == "1", "Yes", "No")

# confusion matrix
# - explicit levels keep a full 2x2 table even if only one class is predicted
# - positive = "Yes" makes sensitivity/specificity describe cancer detection
#   (caret otherwise treats the first level, "No", as the positive class)
cm <- confusionMatrix(
  factor(testResults_glmn$prediction, levels = c("No", "Yes")),
  as.factor(testResults_glmn$observation),
  positive = "Yes"
)
print(cm)

# roc/auc result
# NOTE(review): presumably class_prob is the predicted probability of "Yes" - confirm.
roc_glmn <- roc(testResults_glmn$observation, testResults_glmn$class_prob)
auc(roc_glmn)
plot(roc_glmn)
```

### Nearest Shrunken Centroids
```{r}
# Fit nearest shrunken centroids on the balanced, centered/scaled training data
# with the project helper nsc_model_train() and the shared trainControl object.
nsc_model <- nsc_model_train(train_X, train_y, cntrl)
```

```{r}
# get prediction result on the held-out test set; the code below reads the
# `prediction`, `observation` and `class_prob` columns of the helper's output
testResults_nsc <- get_prediction_results(nsc_model, test_X, test_y)

# convert prediction levels to match observation
# NOTE(review): assumes get_prediction_results() encodes a predicted positive
# as "1" - confirm in Modeling.R; any other encoding maps every row to "No".
testResults_nsc$prediction <- ifelse(testResults_nsc$prediction == "1", "Yes", "No")

# confusion matrix
# - explicit levels keep a full 2x2 table even if only one class is predicted
# - positive = "Yes" makes sensitivity/specificity describe cancer detection
#   (caret otherwise treats the first level, "No", as the positive class)
cm <- confusionMatrix(
  factor(testResults_nsc$prediction, levels = c("No", "Yes")),
  as.factor(testResults_nsc$observation),
  positive = "Yes"
)
print(cm)

# roc/auc result
# NOTE(review): presumably class_prob is the predicted probability of "Yes" - confirm.
roc_nsc <- roc(testResults_nsc$observation, testResults_nsc$class_prob)
auc(roc_nsc)
plot(roc_nsc)
```

# Final Model Evaluation & Enhancements


```{r}
# Draw all test-set ROC curves on one figure and write it to a PNG file.
png(filename = "roc_curve_comparison.png", width = 800, height = 600)

### Compare Models using ROC curve
# NOTE(review): the top margin is 0 lines, so the title() drawn below may be
# clipped - consider a larger third value.
par(mar = c(9, 1, 0, 9))

# One entry per curve; the names double as legend labels so the legend cannot
# drift out of sync with what is plotted. Non-linear models first, then linear.
# NOTE(review): roc_mars is in the AUC table but not plotted here - confirm
# whether that omission is intentional.
roc_curves <- list(
  "Neural Network" = roc_nnet,
  "Support Vector Machine (svmRadial)" = roc_svm,
  "Support Vector Machine (svmPoly)" = roc_svmp,
  "K-Nearest Neighbor" = roc_knn,
  "Random Forest" = roc_rf,
  "Logistic Regression" = roc_lr,
  "Linear Discriminant Analysis" = roc_lda,
  "Penalized Logistic Regression" = roc_glmn,
  "Nearest Shrunken Centroids" = roc_nsc
)

# Single source of truth for colours. Previously the last two curves were drawn
# in darkgray/lightgray while the legend showed purple/pink.
roc_colors <- c("red", "blue", "brown", "yellow", "orange",
                "black", "gray", "purple", "pink")
stopifnot(length(roc_curves) == length(roc_colors))

for (curve_idx in seq_along(roc_curves)) {
  # The first curve creates the plot; the remaining curves are overlaid on it
  plot(roc_curves[[curve_idx]], type = "s", add = curve_idx > 1,
       col = roc_colors[curve_idx], legacy.axes = TRUE)
}

legend("bottomright", legend = names(roc_curves),
       col = roc_colors, lwd = 2, bty = "n", xpd = TRUE)

title(main = "Compare ROC curves from Various Models")

# Close the PNG device so the file is flushed to disk and later plots go back
# to the default device
invisible(dev.off())

```

## Model performance based on different metrics (AUC/ROC, Accuracy)

```{r}

# Test-set AUC per model, read from the ROC objects built in earlier chunks
nnetAuc <- auc(roc_nnet)
marsAuc <- auc(roc_mars)
svmAuc <- auc(roc_svm)
svmpAuc <- auc(roc_svmp)
knnAuc <- auc(roc_knn)
rfAuc <- auc(roc_rf)
lrAuc <- auc(roc_lr)
ldaAuc <- auc(roc_lda)
glmnAuc <- auc(roc_glmn)
nscAuc <- auc(roc_nsc)

# Test-set accuracy per model via the project helper get_accuracy()
nnetAcc <- get_accuracy(nnet_model, test_X, test_y)
marsAcc <- get_accuracy(mars_model, test_X, test_y)
svmAcc <- get_accuracy(svm_model, test_X, test_y)
svmpAcc <- get_accuracy(svm_modelPoly, test_X, test_y)
knnAcc <- get_accuracy(knn_model, test_X, test_y)
rfAcc <- get_accuracy(rf_model, test_X, test_y)
lrAcc <- get_accuracy(lr_model, test_X, test_y)
ldaAcc <- get_accuracy(lda_model, test_X, test_y)
glmnAcc <- get_accuracy(glmn_model, test_X, test_y)
nscAcc <- get_accuracy(nsc_model, test_X, test_y)

# One row per model; the three vectors are listed in the same model order
auc_df <- data.frame(
  Model = c(
    "Neural Network",
    "MARS",
    "Support Vector Machine (svmRadial)",
    "Support Vector Machine (svmPoly)",
    "K-Nearest Neighbor",
    "Random Forest",
    "Logistic Regression",
    "Linear Discriminant Analysis",
    "Penalized Logistic Regression",
    "Nearest Shrunken Centroids"
  ),
  AUC = c(
    nnetAuc, marsAuc, svmAuc, svmpAuc, knnAuc,
    rfAuc, lrAuc, ldaAuc, glmnAuc, nscAuc
  ),
  Accuracy = c(
    nnetAcc, marsAcc, svmAcc, svmpAcc, knnAcc,
    rfAcc, lrAcc, ldaAcc, glmnAcc, nscAcc
  )
)

print(auc_df)

# Row of the model with the highest test-set AUC
best_model <- auc_df[which.max(auc_df[["AUC"]]), ]

print(best_model)

```

## Checking the important variables of the optimal model 

```{r}
# Plot the 10 highest-ranked predictors of the full random forest model, using
# unscaled importance scores (scale = FALSE).
# NOTE(review): the model is hard-coded to rf_model rather than taken from
# best_model - update it if another model wins on AUC.
plot(varImp(rf_model, scale = FALSE), top = 10,
     main = "Important Factors for Predicting Cervical \nCancer using Random Forest")

```

## Recursive Feature Elimination (RFE)

```{r}
# use caret package & user-defined-function in Modeling.R to do recursive feature elimination
# The result is used in later chunks to index the columns of train_X and test_X,
# so it is presumably a vector of selected feature names - confirm in Modeling.R.
optimal_rf_features <- rf_rfe(train_X, train_y)
print(optimal_rf_features)
```


```{r, warning=FALSE}
# Retrain the random forest using only the RFE-selected features
# (the original note records 12 of 15 features being kept).
# drop = FALSE keeps a data frame even if a single feature is selected.
train_X_rfe <- train_X[, optimal_rf_features, drop = FALSE]

rf_model_rfe <- rf_model_train(train_X_rfe, train_y, cntrl)
rf_model_rfe
```

```{r}
# Test new model on the same RFE-selected features.
# drop = FALSE keeps a data frame even if a single feature is selected.
test_X_rfe <- test_X[, optimal_rf_features, drop = FALSE]

# get prediction result; the code below reads the `prediction`, `observation`
# and `class_prob` columns of the helper's output
testResults_rf_rfe <- get_prediction_results(rf_model_rfe, test_X_rfe, test_y)

# convert prediction levels to match observation
# NOTE(review): assumes get_prediction_results() encodes a predicted positive
# as "1" - confirm in Modeling.R; any other encoding maps every row to "No".
testResults_rf_rfe$prediction <- ifelse(testResults_rf_rfe$prediction == "1", "Yes", "No")

# confusion matrix
# - explicit levels keep a full 2x2 table even if only one class is predicted
# - positive = "Yes" makes sensitivity/specificity describe cancer detection
#   (caret otherwise treats the first level, "No", as the positive class)
cm <- confusionMatrix(
  factor(testResults_rf_rfe$prediction, levels = c("No", "Yes")),
  as.factor(testResults_rf_rfe$observation),
  positive = "Yes"
)
print(cm)

# roc/auc result
# NOTE(review): presumably class_prob is the predicted probability of "Yes" - confirm.
roc_rf_rfe <- roc(testResults_rf_rfe$observation, testResults_rf_rfe$class_prob)
auc(roc_rf_rfe)
plot(roc_rf_rfe)
```

```{r}
# Variable importance of the final random forest model retrained on the
# RFE-selected features: top 10 predictors, unscaled scores (scale = FALSE)
plot(varImp(rf_model_rfe, scale = FALSE), top = 10,
     main = "Important Factors for Predicting Cervical Cancer\n using Random Forest")

```

## Threshold Investigation
```{r}
# Pass the full random forest's test-set results (not the RFE model's) to the
# project helper thresholds_cm() and print what it returns.
# NOTE(review): presumably the helper tabulates confusion-matrix metrics across
# probability cut-offs - confirm in Modeling.R. The `prediction` column of
# testResults_rf was already recoded to "Yes"/"No" in the evaluation chunk.
threshold_df <- thresholds_cm(testResults_rf)
print(threshold_df)
```