-
Notifications
You must be signed in to change notification settings - Fork 0
/
demo_Correlation Analysis.R
57 lines (41 loc) · 2.25 KB
/
demo_Correlation Analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
library(Hmisc)
library(corrplot)
# Correlation Analysis ----
# Calculate the correlation coefficients between each collapsed_compound.
# Reshape the data for the correlation functions: one row per File, one column
# per collapsed_compound, cell values are Percent_Area.
my_data <- shared_comp_plastic_type %>%
  dplyr::select(File, collapsed_compound, Percent_Area) %>%
  # Some samples contain the same compound several times with different
  # values; collapse those duplicates to their mean.
  group_by(File, collapsed_compound) %>%
  # .groups = "drop" returns an ungrouped result, so the reshape below does not
  # run on data still grouped by File and no regrouping message is printed.
  summarise(Percent_Area = mean(Percent_Area), .groups = "drop") %>%
  # File/compound combinations that never occur become NA after widening.
  pivot_wider(names_from = collapsed_compound, values_from = Percent_Area) %>%
  # Move File into the row names so that only compound columns remain.
  tibble::column_to_rownames(var = "File")
# Fill NA ----
# Replace every missing cell of my_data with a random draw from a uniform
# distribution bounded by the two smallest Percent_Area values found in
# shared_comp_plastic_type (sort() drops NA by default).
# NOTE: the draws are random; call set.seed() beforehand if the filled values
# need to be reproducible.
fill_range <- sort(shared_comp_plastic_type$Percent_Area)[1:2]
# seq_len() yields an empty sequence when my_data has no columns, whereas
# 1:ncol(my_data) would iterate over c(1, 0) and fail.
for (j in seq_len(ncol(my_data))) {
  missing_rows <- base::is.na(my_data[[j]])
  my_data[missing_rows, j] <- runif(sum(missing_rows),
                                    min = fill_range[1],
                                    max = fill_range[2])
}
# Correlated variables ----
# Spearman rank correlation between every pair of columns of my_data.
# The result's $r (coefficients) and $P (p-values) matrices are used below.
res2 <- Hmisc::rcorr(as.matrix(my_data), type = "spearman")
# NOTE(review): res3 used to be produced by a second rcorr() call that was
# identical to the one above, so it always equals res2 and corr_mat_filled
# always equals corr_mat. The result is reused instead of recomputed; confirm
# whether res3 was meant to be computed from a different input.
res3 <- res2

# NOTE: keep only the compound pairs with a strong (cor > 0.8) and
# significant (p < 0.05) positive correlation.
corr_mat <- flattenCorrMatrix(res2$r, res2$P) %>%
  dplyr::filter(cor > 0.8 & p < 0.05)
corr_mat_filled <- flattenCorrMatrix(res3$r, res3$P) %>%
  dplyr::filter(cor > 0.8 & p < 0.05)

# Correlation plot with variables ordered by hierarchical clustering.
# NOTE(review): stats::cor() defaults to Pearson, while the filtering above
# uses Spearman — confirm that the mismatch is intended.
corrplot(cor(my_data), order = "hclust", tl.col = "black", tl.srt = 45)

# Pairwise distances between samples (rows of my_data), shown in the data
# viewer. View() needs an interactive session, so skip it in batch runs.
if (interactive()) {
  View(as.matrix(stats::dist(as.matrix(my_data),
                             method = "minkowski",
                             upper = TRUE)))
}
# Check the relationship between each predictor and the response ----
# For every column of filled.data after the first, run a Kruskal-Wallis rank
# sum test of that column against plastic_type and store the p-value under the
# name "plastic_type_<column>".
# NOTE(review): assumes the first column of filled.data is the one to skip
# (presumably plastic_type) — confirm against where filled.data is built.
# colnames(...)[-1] is used instead of subsetting columns 2:ncol first: that
# subset drops to a plain vector (no column names) when a data.frame has
# exactly two columns, and 2:ncol counts down when there is only one column.
predictor_cols <- colnames(filled.data)[-1]
cor.res <- list()
for (col in predictor_cols) {
  # [[col]] always extracts a plain vector, for data frames and tibbles alike.
  cor.res[[paste0("plastic_type", "_", col)]] <-
    stats::kruskal.test(filled.data[[col]] ~ filled.data$plastic_type)$p.value
}
# vapply() guarantees a numeric vector even when no test was run, so the
# pvalue column always exists; the list names become the row names.
cor.resdf <- data.frame(pvalue = vapply(cor.res, identity, numeric(1)))
# Keep only the significant predictors, smallest p-value first.
cor.resdf <- cor.resdf %>%
  dplyr::arrange(pvalue) %>%
  dplyr::filter(pvalue < 0.05)
# Box plot of the "Compound_867." column of filled.data, with one box per
# plastic_type and boxes coloured by plastic_type.
library(ggpubr)
ggpubr::ggboxplot(
  data = filled.data,
  x = "plastic_type",
  y = "Compound_867.",
  color = "plastic_type"
)