-
Notifications
You must be signed in to change notification settings - Fork 0
/
cluster_result_analyser.R
166 lines (131 loc) · 5.85 KB
/
cluster_result_analyser.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# ---- Command-line arguments and input ----
# Expected arguments: <input cluster results file> <output PDF file>
# Echo the script's commands while it runs non-interactively.
options(echo = TRUE)
args <- commandArgs(trailingOnly = TRUE)
print(args)
# Fail early with a clear message: without this check a missing argument
# reaches read.csv2() / pdf() as NA and fails there with an unhelpful error.
if (length(args) < 2) {
  stop("Expected two arguments: <input cluster results file> <output PDF file>",
       call. = FALSE)
}
# First argument: the tab-separated cluster results table.
cluster_results <- read.csv2(args[1], sep = "\t", na.strings = "NA",
                             stringsAsFactors = FALSE, fill = TRUE)
# Second argument: path of the file the plots are written to
# (it is passed to pdf() further down).
output_file <- args[2]
# The raw argument vector is not needed any more.
rm(args)
# ---- Column formatting ----
# Columns that must be numeric for the filtering and plotting below.
numeric_columns <- c(
  "AVG_PMZ", "AVG_PMZ_HIGHEST",
  "MAX_PMZ", "MIN_PMZ", "PMZ_RANGE",
  "MAX_PMZ_HIGHEST", "MIN_PMZ_HIGHEST", "PMZ_RANGE_HIGHEST",
  "MAX_RATIO"
)
# Check for the columns by exact name. `$` partially matches data frame column
# names, so e.g. a missing MAX_PMZ would silently be filled from
# MAX_PMZ_HIGHEST instead of raising an error.
missing_columns <- setdiff(numeric_columns, names(cluster_results))
if (length(missing_columns) > 0) {
  stop("Input is missing expected column(s): ",
       paste(missing_columns, collapse = ", "), call. = FALSE)
}
# Convert each column once. Going through as.character() first also gives the
# right values should a column ever arrive as a factor. Entries that cannot be
# parsed become NA (with a warning from as.numeric()).
for (column in numeric_columns) {
  cluster_results[[column]] <- as.numeric(as.character(cluster_results[[column]]))
}
# Drop the temporaries so only cluster_results is left behind.
rm(numeric_columns, missing_columns, column)
# Create a high quality subset: clusters with a max ratio of at least 0.7,
# found in at least 2 projects and containing at least 10 spectra.
is_high_quality <- cluster_results$MAX_RATIO >= 0.7 &
  cluster_results$NUM_PROJECTS >= 2 &
  cluster_results$NUM_SPECTRA >= 10
# which() keeps only the rows where the condition is TRUE. Indexing with the
# raw logical vector would add an all-NA row for every cluster whose condition
# is NA (e.g. a MAX_RATIO that could not be converted to a number).
high_quality_cluster_results <- cluster_results[which(is_high_quality), ]
rm(is_high_quality)
# plotting
library(ggplot2)
library(gridExtra)
# Open the PDF device; every page drawn below goes into output_file.
pdf(file = output_file)

# ---- Page: max ratio ----
# Scatter plot of max ratio against the number of spectra per cluster.
ratio_scatter_plot <- function(clusters, plot_title) {
  ggplot(clusters, aes(x = NUM_SPECTRA, y = MAX_RATIO)) +
    geom_point(shape = 1) +
    labs(title = plot_title, x = "Number of Spectra", y = "Max Ratio")
}

# Density-scaled histogram of max ratio with a density curve drawn on top.
ratio_density_plot <- function(clusters, plot_title) {
  ggplot(clusters, aes(x = MAX_RATIO)) +
    geom_histogram(aes(y = ..density..), binwidth = 0.01,
                   colour = "black", fill = "white") +
    geom_density(fill = "#FF6666", alpha = 0.3) +
    labs(title = plot_title, x = "Max Ratio", y = "Density")
}

# Left column: all clusters; right column: the high quality subset.
ratio_size <- ratio_scatter_plot(cluster_results, "All Clusters")
ratio_size_high <- ratio_scatter_plot(high_quality_cluster_results,
                                      "High Quality Clusters")
ratio_size_density <- ratio_density_plot(cluster_results, "All Clusters")
ratio_size_density_high <- ratio_density_plot(high_quality_cluster_results,
                                              "High Quality Clusters")

grid.arrange(ratio_size, ratio_size_high,
             ratio_size_density, ratio_size_density_high,
             ncol = 2)
# ---- Page: cluster size ----
# Density curve of the number of spectra per cluster.
size_density_plot <- function(clusters, plot_title) {
  ggplot(clusters, aes(x = NUM_SPECTRA)) +
    geom_density(fill = "#FF6666", alpha = 0.3) +
    labs(title = plot_title, x = "Number Of Spectra", y = "Density")
}

# Single box plot of the number of spectra per cluster; factor(0) puts every
# cluster into the same (unlabelled) group on the x axis.
size_box_plot <- function(clusters, plot_title) {
  ggplot(clusters, aes(x = factor(0), y = NUM_SPECTRA)) +
    geom_boxplot() +
    labs(title = plot_title, x = "", y = "Number Of Spectra")
}

# Left column: all clusters; right column: the high quality subset.
cluster_size <- size_density_plot(cluster_results, "All Clusters")
cluster_size_high <- size_density_plot(high_quality_cluster_results,
                                       "High Quality Clusters")
cluster_size_box <- size_box_plot(cluster_results, "All Clusters")
cluster_size_box_high <- size_box_plot(high_quality_cluster_results,
                                       "High Quality Clusters")

grid.arrange(cluster_size, cluster_size_high,
             cluster_size_box, cluster_size_box_high,
             ncol = 2)
# ---- Page: precursor m/z range ----
# Scatter plot of the precursor m/z range against the number of spectra.
mz_range_plot <- function(clusters, plot_title) {
  ggplot(clusters, aes(x = NUM_SPECTRA, y = PMZ_RANGE)) +
    geom_point(shape = 1) +
    labs(title = plot_title, x = "Number of Spectra", y = "Precursor m/z Range")
}

# All clusters on the left, the high quality subset on the right.
mz_range <- mz_range_plot(cluster_results, "All Clusters")
mz_range_high <- mz_range_plot(high_quality_cluster_results,
                               "High Quality Clusters")

grid.arrange(mz_range, mz_range_high, ncol = 2)
# ---- Page: number of projects ----
# Scatter plot of the number of projects against the number of spectra.
# The y-axis label previously read "NUmber of Projects"; the typo is fixed
# here once for both plots.
spectra_projects_plot <- function(clusters, plot_title) {
  ggplot(clusters, aes(x = NUM_SPECTRA, y = NUM_PROJECTS)) +
    geom_point(shape = 1) +
    ggtitle(plot_title) +
    xlab("Number of Spectra") +
    ylab("Number of Projects")
}

# All clusters on the left, the high quality subset on the right.
spectra_projects <- spectra_projects_plot(cluster_results, "All Clusters")
spectra_projects_high <- spectra_projects_plot(high_quality_cluster_results,
                                               "High Quality Clusters")

grid.arrange(spectra_projects, spectra_projects_high, ncol = 2)
# ---- Page: number of species ----
# Density-scaled histogram (bin width 1) of the number of species per cluster
# with a density curve drawn on top.
species_plot <- function(clusters, plot_title) {
  ggplot(clusters, aes(x = NUM_SPECIES)) +
    geom_histogram(aes(y = ..density..), binwidth = 1,
                   colour = "black", fill = "white") +
    geom_density(fill = "#FF6666", alpha = 0.3) +
    labs(title = plot_title, x = "Number Of Species", y = "Density")
}

# All clusters on the left, the high quality subset on the right.
species <- species_plot(cluster_results, "All Clusters")
species_high <- species_plot(high_quality_cluster_results,
                             "High Quality Clusters")

grid.arrange(species, species_high, ncol = 2)
# ---- Page: number of peptides ----
# Density-scaled histogram (bin width 1) of the number of peptides per cluster
# with a density curve drawn on top.
peptide_plot <- function(clusters, plot_title) {
  ggplot(clusters, aes(x = NUM_PEPTIDES)) +
    geom_histogram(aes(y = ..density..), binwidth = 1,
                   colour = "black", fill = "white") +
    geom_density(fill = "#FF6666", alpha = 0.3) +
    labs(title = plot_title, x = "Number Of Peptides", y = "Density")
}

# All clusters on the left, the high quality subset on the right.
peptide <- peptide_plot(cluster_results, "All Clusters")
peptide_high <- peptide_plot(high_quality_cluster_results,
                             "High Quality Clusters")

grid.arrange(peptide, peptide_high, ncol = 2)
# Close the graphics device so the output file is finalised.
dev.off()