-
Notifications
You must be signed in to change notification settings - Fork 2
/
decorrelate.r
27 lines (24 loc) · 905 Bytes
/
decorrelate.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
decorrelate = function(data, k1, k2)
# Keep one representative feature out of every group of correlated features.
#
# The numeric columns of `data` are correlated pairwise. The features are
# embedded with classical MDS using 1 - |correlation| as the dissimilarity and
# grouped with k-means; from each cluster the feature with the largest
# absolute correlation with the response is kept.
#
# input:
#   data: data frame; its first numeric column is the response and the
#         remaining numeric columns are the features (the response may have
#         any column name)
#   k1:   the dimension for MDS (passed to cmdscale as `k`)
#   k2:   the number of k-means clusters (passed to kmeans as `centers`),
#         i.e. the number of features that are kept
# output:
#   unnamed list of 3:
#     [[1]] data frame holding the response, the selected features and all
#           non-numeric columns of `data`
#     [[2]] MDS fit returned by cmdscale(eig = TRUE)
#     [[3]] k-means fit returned by kmeans
# errors:
#   stops when `data` has fewer than two numeric columns or when the response
#   has zero variance (its correlations are then undefined).
{
  numeric_cols = which(vapply(data, is.numeric, logical(1)))
  other_cols = setdiff(seq_along(data), numeric_cols)
  if (length(numeric_cols) < 2) {
    stop("`data` needs a numeric response and at least one numeric feature",
         call. = FALSE)
  }

  # drop = FALSE keeps a data frame whatever the number of numeric columns.
  corr = cor(data[, numeric_cols, drop = FALSE], use = "complete.obs")

  # cor() returns NA for every pair that involves a zero-variance column while
  # the diagonal stays 1, so such a column has exactly ncol - 1 missing
  # entries. Those columns cannot be placed by MDS and are dropped.
  usable = rowSums(is.na(corr)) != ncol(corr) - 1
  if (!usable[1]) {
    stop("the response (first numeric column) has zero variance",
         call. = FALSE)
  }

  # Positions (within the correlation matrix) of the usable features; the
  # leading entry is the response itself and is removed.
  keep = which(usable)[-1]

  fit = cmdscale(1 - abs(corr[keep, keep, drop = FALSE]), k = k1, eig = TRUE)
  cl = kmeans(fit$points, centers = k2)

  # |correlation with the response| for exactly the features that were
  # clustered, in the same order as cl$cluster.
  r_info = abs(corr[1, keep])

  # For every cluster pick the member most correlated with the response.
  selected = unlist(lapply(seq_len(nrow(cl$centers)), function(i) {
    members = keep[cl$cluster == i]
    members[which.max(r_info[cl$cluster == i])]
  }))

  target_all1 = cbind(
    data[, numeric_cols[c(1, selected)], drop = FALSE],
    data[, other_cols, drop = FALSE]
  )
  list(target_all1, fit, cl)
}