-
Notifications
You must be signed in to change notification settings - Fork 4
/
dtw_clustering.R
89 lines (85 loc) · 3.42 KB
/
dtw_clustering.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#####################################################################################
dtw_clustering <- function(clustering_data,threshold)
{
#perform clustering using dtwclust
#check that the threshold is below 1
# if it is not, set it to be 1
if (threshold > 1){
threshold = 1
}
mydata = clustering_data
normmatrix <- (mydata[,]-rowMeans(mydata)*matrix(1,nrow=dim(mydata)[1],ncol=dim(mydata)[2]))/
apply(mydata[,],1,sd)
normmatrix[is.na(rowSums(normmatrix)),] = rep(1,dim(normmatrix)[2])
#supervised clustering using dtw
clusters <- matrix(0,nrow=dim(normmatrix)[1],ncol=1)
ind=1
#for each gene, compare its profile to the rest of the genes
#any gene with a sufficiently matching profile is clustered together
for (j in 1:(dim(normmatrix)[1])){
print(paste('Clustering gene ',toString(j),sep=""))
#first check if gene j has already been clustered
#if it has, we skip it
#if it hasn't, we initialize the cluster #
if (clusters[j]!=0){
next
}else{
clusters[j]=ind
ind = ind+1
}
for(k in (j+1):dim(normmatrix)[1]-1){
#first check if gene k has already been clustered
#if it has, we skip it
if (clusters[k] != 0){
next
}
#otherwise perform dtw on the two genes
#DTW expects the time series as a vector
ts1 = as.vector(t(normmatrix[j,]))
ts2 = as.vector(t(normmatrix[k,]))
#perform dtw
dtwresults = dtw(ts1,ts2)
#DTW gives us the indices that matched
#if the time series matches perfectly, then x1=y1, x2=y2, etc.
#we can test this by calculating the proportion of matches
xinds = dtwresults$index1
yinds = dtwresults$index2
propmatches= sum(xinds==yinds)/length(xinds)
#if propmatches>= threshold, cluster gene k with gene j
#otherwise, move on
if (propmatches>=threshold){
clusters[k]=clusters[j]
}
}
}
#put together gene names, normalized expression, and clusters
results <- data.frame(normmatrix,clusters,row.names=row.names(mydata))
# #prepare data for plotting
# plotdata <- as.data.frame(t(results[,1:dim(normmatrix)[2]]))
# stacked <- stack(plotdata)
# stacked[,3] = rep(colnames(results)[1:dim(normmatrix)[2]],dim(plotdata)[2])
# stacked[,4] = rep(clusters,each=dim(normmatrix)[2])
# colnames(stacked) <- c('Norm.Intensity','gene','group','cluster')
# #add means of each cluster as reference lines
# reflines = by(results[,1:dim(normmatrix)[2]], results$clusters, colMeans)
# reflinedata <- do.call(cbind, reflines)
# reflinedata <- as.data.frame(reflinedata)
# reflinestacked <- stack(reflinedata)
# reflinestacked[,2] <- rep(colnames(results)[1:dim(normmatrix)[2]],max(clusters))
# reflinestacked[,3] <- rep(1:max(clusters),each=dim(normmatrix)[2])
# colnames(reflinestacked) <- c('Norm.Intensity','group','cluster')
# #plot one cluster per file
# for (i in 1:max(results$clusters)){
# g <- ggplot(data = stacked[stacked$cluster==i,],
# mapping = aes(x=group, y=Norm.Intensity, colour = as.factor(cluster), group=gene)) +
# geom_line() + theme(legend.position="none")+
# scale_x_discrete(limits = unique(stacked$group), labels=colnames(mydata))
# g <- g + geom_line(data = reflinestacked[reflinestacked$cluster==i,],
# aes(x = group, y = Norm.Intensity, group = cluster), colour='BLACK')
# ggplotly(g)
# ggsave(paste('Cluster Plots/cluster', i, '.png', sep=""),device="png",width=5, height=3,dpi=600)
# }
#finally, save the final data table
write.csv(results,'clusters.csv')
return(results)
}