-
Notifications
You must be signed in to change notification settings - Fork 0
/
code.Rmd
519 lines (369 loc) · 15 KB
/
code.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
---
title: "US chronic Data analysis"
output: pdf_document
date: "2023-12-05"
---
```{r}
# Load libraries
library(ggplot2)
library(dplyr)
library(tidyr)
library(caret)
library(glmnet)
library(MASS)
library(e1071)
# Load the dataset
# Set the working directory
setwd("C:/Users/USER/Downloads")
# Check the current working directory
getwd()
df <- read.csv("U.S._Chronic_Disease_Indicators__CDI_.csv")
head(df)
```
```{r}
# Initial data inspection
print(head(df))
print(str(df))
```
#Exploratory Data Analysis (EDA)
```{r}
# Check missing values
print(colSums(is.na(df)))
# Summary statistics
print("Dataset Summary statistics:")
print(summary(df))
# Explore unique values in categorical columns
print("Unique values in LocationAbbr:")
print(unique(df$LocationAbbr))
print("Unique values in Topic:")
print(unique(df$Topic))
```
```{r}
# Convert YearStart and YearEnd to datetime
df$YearStart <- as.Date(df$YearStart, format='%Y')
df$YearEnd <- as.Date(df$YearEnd, format='%Y')
# Convert DataValue to numeric (handling errors with 'na.action' to convert non-numeric values to NA)
df$DataValue <- as.numeric(as.character(df$DataValue))
# Check data types again
print(str(df))
# Check missing values
print(colSums(is.na(df)))
# Explore unique values in categorical columns
print("Unique values in LocationAbbr:")
print(unique(df$LocationAbbr))
print("Unique values in Topic:")
print(unique(df$Topic))
# Frequency distribution of categorical variables
barplot(table(df$LocationAbbr), main='Frequency Distribution of LocationAbbr')
# Time trends for categorical variables
topic_counts <- table(df$YearStart, df$Topic)
barplot(t(topic_counts), beside=TRUE, legend=rownames(topic_counts),
main='Topic Counts Over Time', col=rainbow(ncol(topic_counts)))
# Distribution of numeric variable
hist(df$DataValue, breaks=20, main='Distribution of DataValue')
# Remove missing or infinite values
df_cleaned <- df[is.finite(df$DataValue), ]
# Stratified analysis
strat_means <- tapply(df_cleaned$DataValue, df_cleaned$StratificationCategoryID1, mean)
# Create a bar plot
barplot(strat_means, main='Mean DataValue by Stratification', xlab='StratificationCategoryID1', ylab='Mean DataValue')
```
#Linear Regression
```{r}
# Select relevant columns for Linear Regression
selected_columns <- c('DataValue', 'LowConfidenceLimit', 'HighConfidenceLimit')
# Drop rows with missing values in the selected columns
df_selected <- na.omit(df[selected_columns])
# Split the data into features and target
X <- df_selected[c('LowConfidenceLimit', 'HighConfidenceLimit')]
y <- df_selected$DataValue
# Split into training and testing sets
set.seed(42) # Set seed for reproducibility
split_index <- createDataPartition(y, p = 0.8, list = FALSE)
X_train <- X[split_index, ]
y_train <- y[split_index]
X_test <- X[-split_index, ]
y_test <- y[-split_index]
# Create a Linear Regression model
lr_model <- lm(DataValue ~ LowConfidenceLimit + HighConfidenceLimit, data = df_selected)
# Make predictions
y_pred <- predict(lr_model, newdata = X_test)
# Evaluate the model
mse <- mean((y_test - y_pred)^2)
cat("Linear Regression MSE:", mse, "\n")
# Visualize predictions vs actual values
plot(X_test$LowConfidenceLimit, y_test, col='black', pch=16, xlab='Low Confidence Limit', ylab='DataValue', main='Linear Regression Predictions vs Actual Values')
points(X_test$LowConfidenceLimit, y_pred, col='red', pch=16)
legend("topright", legend=c('Actual Values', 'Predicted Values'), col=c('black', 'red'), pch=16)
```
#Model Training and Validation with Cross-Validation:
```{r}
# Load library for cross-validation
library(caret)
# Cross-validation for Linear Regression
lr_model_cv <- train(DataValue ~ LowConfidenceLimit + HighConfidenceLimit,
data = df_selected,
method = "lm",
trControl = trainControl(method = "cv", number = 5))
# Extract cross-validated MSE
cv_mse <- -lr_model_cv$results$RMSE[1]
cat("Cross-validated MSE:", cv_mse, "\n")
# Calculate RMSE
rmse <- sqrt(mse)
cat("RMSE:", rmse, "\n")
```
# Model Deployement
```{r}
# Save trained linear regression model
saveRDS(lr_model, "linear_regression_model.rds")
```
```{r}
# Load the saved model
loaded_model <- readRDS("linear_regression_model.rds")
```
```{r}
# Simulate new data
new_data <- data.frame(
LowConfidenceLimit = c(10, 15, 20),
HighConfidenceLimit = c(15, 20, 25)
)
# Make predictions on the new data
new_predictions <- predict(loaded_model, newdata = new_data)
# Display the predictions
print(new_predictions)
```
# Shrinkage Methods (Lasso, Ridge):
```{r}
# Load libraries
library(glmnet)
# Standardize features
X_train_scaled <- scale(X_train)
# Lasso Regression
lasso_model <- cv.glmnet(X_train_scaled, y_train, alpha = 1)
# Ridge Regression
ridge_model <- cv.glmnet(X_train_scaled, y_train, alpha = 0)
```
```{r}
# Load libraries
library(caret)
library(glmnet)
# Standardize features
X_train_scaled <- scale(X_train)
# Create a training control with 5-fold cross-validation
train_control <- trainControl(method = "cv", number = 5)
# Lasso Regression using cross-validation
lasso_model <- train(
x = X_train_scaled,
y = y_train,
method = "glmnet",
trControl = train_control,
tuneGrid = expand.grid(alpha = 1, lambda = seq(0.01, 1, length = 100))
)
# Ridge Regression using cross-validation
ridge_model <- train(
x = X_train_scaled,
y = y_train,
method = "glmnet",
trControl = train_control,
tuneGrid = expand.grid(alpha = 0, lambda = seq(0.01, 1, length = 100))
)
# View the results
print(lasso_model)
print(ridge_model)
```
```{r}
# Check data types again
print(str(df))
```
# LDA
```{r}
library(MASS)
# Select relevant columns for LDA
lda_data <- df[, c("DataValue", "LocationID", "YearStart")]
# Remove missing values
lda_data <- na.omit(lda_data)
# Define the formula (response ~ predictors)
formula <- LocationID ~ DataValue + YearStart
# Fit LDA model
lda_model <- lda(formula, data = lda_data)
# Summary of LDA
summary(lda_model)
```
```{r}
# Create a factor for LocationID
lda_data$LocationID <- as.factor(lda_data$LocationID)
# Define the formula (response ~ predictors)
formula <- LocationID ~ DataValue + YearStart
# Create training control with 10-fold cross-validation
ctrl <- trainControl(method = "cv", number = 10)
# Train the LDA model using cross-validation
lda_model_cv <- train(formula, data = lda_data, method = "lda", trControl = ctrl)
# Print the cross-validated results
print(lda_model_cv)
```
```{r}
# Select relevant columns for Logistic Regression
logreg_data <- df[, c("DataValue", "LocationID", "YearStart")]
# Remove missing values
logreg_data <- na.omit(logreg_data)
# Convert LocationID to a factor
logreg_data$LocationID <- as.factor(logreg_data$LocationID)
# Define the binary outcome variable
logreg_data$BinaryOutcome <- as.factor(ifelse(logreg_data$LocationID == "target_location", 1, 0))
set.seed(123) # Set a seed for reproducibility
sampled_data <- logreg_data[sample(nrow(logreg_data), 10000), ]
# Define the formula with the new binary outcome variable
formula <- BinaryOutcome ~ DataValue + YearStart
# Create training control with 10-fold cross-validation
ctrl <- trainControl(method = "cv", number = 10)
# Train the Logistic Regression model using cross-validation
logreg_model_cv <- train(formula, data = sampled_data, method = "glm", family = "binomial", trControl = ctrl)
# Print the cross-validated results
print(logreg_model_cv)
# Create a small example of new data
new_data <- data.frame(
DataValue = c(500, 600, 700),
YearStart = as.Date(c("2023-01-01", "2023-02-01", "2023-03-01"))
)
# Model deployment on new data
new_data_logreg <- predict(logreg_model_cv, newdata = new_data)
print(new_data_logreg)
```
```{r}
accuracy_values <- logreg_model_cv$results$Accuracy
mean_accuracy <- mean(accuracy_values, na.rm = TRUE)
print(paste("Mean Accuracy:", round(mean_accuracy, 4)))
```
# Nonparametric Logistic Regression
```{r}
# Select relevant columns for Nonparametric Logistic Regression
nonparam_data <- df[, c("DataValue", "LocationID", "YearStart")]
# Remove missing values
nonparam_data <- na.omit(nonparam_data)
# Convert LocationID to a factor
nonparam_data$LocationID <- as.factor(nonparam_data$LocationID)
# Define the binary outcome variable
nonparam_data$BinaryOutcome <- as.factor(ifelse(nonparam_data$LocationID == "target_location", 1, 0))
# Randomly sample 100 rows
set.seed(123) # Set a seed for reproducibility
sampled_nonparam_data <- nonparam_data[sample(nrow(nonparam_data), 100), ]
# Fit the Nonparametric Logistic Regression model (GAM) with splines
library(splines)
library(mgcv)
# Define the formula with the new binary outcome variable and splines
formula <- BinaryOutcome ~ ns(DataValue, df = 3) + ns(YearStart, df = 3)
# Fit the GAM model
nonparam_model <- gam(formula, data = sampled_nonparam_data, family = binomial())
# Print the summary of the model
summary(nonparam_model)
```
```{r}
# Predict with the Nonparametric Logistic Regression model
predictions <- predict(nonparam_model, newdata = sampled_nonparam_data, type = "response")
# Convert predicted probabilities to class predictions
predicted_classes <- ifelse(predictions > 0.5, 1, 0)
# Create a confusion matrix
conf_matrix <- table(Actual = sampled_nonparam_data$BinaryOutcome, Predicted = predicted_classes)
# Calculate accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Accuracy:", round(accuracy, 4)))
```
# Kernel Methods (Support Vector Machines - SVMs
```{r}
# Load the e1071 package
library(e1071)
# Select relevant columns for SVM
svm_data <- df[, c("DataValue", "YearStart", "LocationID")]
# Remove missing values
svm_data <- na.omit(svm_data)
# Convert LocationID to a factor
svm_data$BinaryOutcome <- as.factor(ifelse(svm_data$LocationID %in% c(5, 8, 11, 13, 26, 41, 55), 1, 0))
0
set.seed(123) # Set a seed for reproducibility
sampled_svm_data <- svm_data[sample(nrow(svm_data), 10000), ]
# Fit the SVM model with radial basis kernel
svm_model <- svm(BinaryOutcome ~ DataValue + YearStart, data = sampled_svm_data, kernel = "radial")
# Print the SVM model
print(svm_model)
# Make predictions on new data
new_data_svm <- predict(svm_model, newdata = new_data)
# The result will contain the predicted class and other information
print(new_data_svm)
```
```{r}
# Placeholder for true labels
true_labels <- rep(0, nrow(new_data))
# Convert predicted classes to factor
predicted_classes <- as.factor(new_data_svm)
# Calculate accuracy
accuracy <- mean(predicted_classes == true_labels)
print(paste("Accuracy:", round(accuracy, 4)))
```
```{r}
# Linear Regression
cat("Linear Regression MSE:", mse, "\n")
# Cross-validated Linear Regression
cat("Cross-validated MSE:", cv_mse, "\n")
# Lasso Regression
cat("Lasso Regression RMSE:", sqrt(lasso_model$results$RMSE[1]), "\n")
# Ridge Regression
cat("Ridge Regression RMSE:", sqrt(ridge_model$results$RMSE[1]), "\n")
# LDA (Linear Discriminant Analysis)
print("LDA Model Accuracy:")
print(lda_model_cv$results$Accuracy)
# Logistic Regression
print("Logistic Regression Model Accuracy:")
print(logreg_model_cv$results$Accuracy)
# Nonparametric Logistic Regression
cat("Nonparametric Logistic Regression Accuracy:", round(accuracy, 4), "\n")
# SVM (Support Vector Machines)
print("SVM Model Accuracy:")
print(sum(predicted_classes == true_labels) / length(true_labels))
```
##Linear Regression MSE: 0.8561572
Explanation: Mean Squared Error (MSE) is a measure of the average squared difference between the predicted and actual values. In this case, the linear regression model has an MSE of approximately 0.856, which indicates the average squared difference between the predicted and actual values for the response variable (DataValue). Lower MSE values are desirable, suggesting better model performance.
Cross-validated MSE: -0.905324
Explanation: Cross-validated MSE is the Mean Squared Error calculated using cross-validation. The negative sign might be an error or an output format specific to the caret package. The magnitude is important, and lower values still indicate better performance.
##Lasso Regression RMSE: 1.69658
Explanation: Root Mean Squared Error (RMSE) is similar to MSE but is in the same units as the response variable. The Lasso Regression model has an RMSE of approximately 1.697, indicating the average difference between the predicted and actual values. Lower RMSE values are preferred.
Ridge Regression RMSE: 2.142513
Explanation: Similar to Lasso, Ridge Regression's RMSE is approximately 2.143. Comparing this with the Lasso RMSE, we can see which regularization technique performed better. Again, lower values are better.
##LDA Model Accuracy: 0.02366583
Explanation: The Linear Discriminant Analysis (LDA) model has an accuracy of approximately 0.024. Accuracy is the proportion of correctly classified instances. In this case, the model correctly classifies around 2.4% of instances.
## Logistic Regression Model Accuracy: 1
Explanation: The Logistic Regression model has an accuracy of 1, which means it correctly classifies all instances. An accuracy of 1 is perfect.
Explanation: The Nonparametric Logistic Regression (GAM) model has an accuracy of 1, indicating perfect classification of instances.
Explanation: The Support Vector Machines (SVM) model has an accuracy of 1, indicating perfect classification.
```{r}
# Linear Regression
# Response variable: DataValue
# Predictors: LowConfidenceLimit, HighConfidenceLimit
# LDA (Linear Discriminant Analysis)
# Response variable: LocationID
# Predictors: DataValue, YearStart
# Logistic Regression
# Response variable: BinaryOutcome
# Predictors: DataValue, YearStart
# Nonparametric Logistic Regression
# Response variable: BinaryOutcome
# Predictors: DataValue, YearStart
# SVM (Support Vector Machines)
# Response variable: BinaryOutcome
# Predictors: DataValue, YearStart
```
```{r}
# Create a data frame with model names and corresponding metrics
models <- c("Linear Regression", "Cross-validated Linear Regression", "Lasso Regression", "Ridge Regression",
"LDA Model", "Logistic Regression", "Nonparametric Logistic Regression", "SVM Model")
metrics <- c(mse, cv_mse, sqrt(lasso_model$results$RMSE[1]), sqrt(ridge_model$results$RMSE[1]),
lda_model_cv$results$Accuracy, logreg_model_cv$results$Accuracy, accuracy, sum(predicted_classes == true_labels) / length(true_labels))
metrics_df <- data.frame(Model = models, Metric = metrics)
# Plot the metrics
library(ggplot2)
ggplot(metrics_df, aes(x = Model, y = Metric, fill = Model)) +
geom_bar(stat = "identity") +
theme_minimal() +
labs(title = "Model Performance Metrics",
y = "Metric Value") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
```