-
Notifications
You must be signed in to change notification settings - Fork 0
/
benchmark_data_preprocess.Rmd
138 lines (124 loc) · 6.35 KB
/
benchmark_data_preprocess.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
---
title: "Validate the bias correction part of DCATS"
author: "Xinyi_Lin"
date: "07/11/2022"
output: html_document
---
```{r}
library(scater)
library(Seurat)
library(DCATS)
library(tidyverse)
```
## Try the Kaufmann2021 data
This data is from " Identifying CNS-colonizing T cells as potential therapeutic targets to prevent progression of multiple sclerosis"
```{r, eval=FALSE}
Kaufmann2021_cellMeta = read_csv("/storage/holab/linxy/DCATS/real_world/Kaufmann_2021/GSE144744_metadata_per_cell.csv")
Kaufmann2021_subset_meta = Kaufmann2021_cellMeta %>% filter(cluster_names %in% c("T06", "T07", "T09", "T10", "T11", "T02", "T08"))
# Kaufmann2021_subset_meta = Kaufmann2021_cellMeta %>% filter(cluster_names %in% c("T06", "T07", "T09", "T10", "T11", "T02", "T03"))
rm(Kaufmann2021_cellMeta)
Kaufmann2021_mtx = Read10X("/storage/holab/linxy/DCATS/real_world/Kaufmann_2021/RNA_integrated")
Kaufmann2021_subset_mtx = Kaufmann2021_mtx[,Kaufmann2021_subset_meta$cell_names]
save(Kaufmann2021_subset_meta, Kaufmann2021_subset_mtx, file = "/storage/holab/linxy/DCATS/real_world/Kaufmann_2021/subste_Tcells.RData")
```
```{r}
load("/storage/holab/linxy/DCATS/real_world/Kaufmann_2021/subste_Tcells.RData")
Kaufmann2021_subset_seurat = CreateSeuratObject(Kaufmann2021_subset_mtx)
Kaufmann2021_subset_meta = Kaufmann2021_subset_meta %>%
column_to_rownames('cell_names')
Kaufmann2021_subset_meta = Kaufmann2021_subset_meta[Cells(Kaufmann2021_subset_seurat),]
Kaufmann2021_subset_seurat$clusterRes = Kaufmann2021_subset_meta$cluster_names
```
Select cell with proportion difference T06 and T09, and the surrounding cell types
```{r}
Idents(Kaufmann2021_subset_seurat) = 'clusterRes'
Kaufmann2021_subset2 <- subset(Kaufmann2021_subset_seurat, ident = c("T06", "T07", "T09", "T10"))
Kaufmann2021_subset2 <- FindVariableFeatures(Kaufmann2021_subset2, selection.method = "vst", nfeatures = 2000)
Kaufmann2021_subset2 <- ScaleData(Kaufmann2021_subset2, verbose = FALSE)
Kaufmann2021_subset2 <- RunPCA(Kaufmann2021_subset2, npcs = 30, verbose = FALSE)
Kaufmann2021_subset2 <- RunUMAP(Kaufmann2021_subset2, reduction = "pca", dims = 1:30)
Kaufmann2021_subset2 <- FindNeighbors(Kaufmann2021_subset2, reduction = "pca", dims = 1:30)
Kaufmann2021_subset2 <- FindClusters(Kaufmann2021_subset2, resolution = 0.1)
DimPlot(Kaufmann2021_subset2)
DimPlot(Kaufmann2021_subset2, group.by = 'clusterRes')
```
```{r}
#save(Kaufmann2021_subset2, file = "/storage/holab/linxy/DCATS/real_world/Kaufmann_2021/further_subset.RData")
save(Kaufmann2021_subset_seurat, file = "/storage/holab/linxy/DCATS/real_world/Kaufmann_2021/subset_seurat.RData")
```
```{r}
load("/storage/holab/linxy/DCATS/real_world/Kaufmann_2021/further_subset.RData")
Idents(Kaufmann2021_subset_seurat) = 'clusterRes'
conf.mat <- table(Kaufmann2021_subset2$clusterRes, Kaufmann2021_subset2$seurat_clusters)
#true.conf <- conf.mat/rowSums(conf.mat)
#library(clue)
#order2 = as.vector(solve_LSAP(true.conf, maximum = TRUE))
#simil_matT = true.conf[,order2]
simil_matT = true.conf[apply(true.conf, 2, which.max),] ## change order
print("----simil_matT----")
print(simil_matT)
true_proportion = table(Kaufmann2021_subset2$clusterRes)/length(Cells(Kaufmann2021_subset2))
cur_proportion = table(Kaufmann2021_subset2$seurat_clusters)/length(Cells(Kaufmann2021_subset2))
bs_proportion = multinom_EM(cur_proportion, simil_matT, verbose = FALSE)$mu
names(cur_proportion) = rownames(simil_matT)
names(bs_proportion) = rownames(simil_matT)
print("----true proportion----")
print(true_proportion)
print("----cur proportion----")
print(cur_proportion)
print("----bs proportion----")
print(bs_proportion)
rmse_cur = abs(cur_proportion[names(true_proportion)] - true_proportion) %>% sum()
rmse_bs = abs(bs_proportion[names(true_proportion)] - true_proportion) %>% sum()
print(rmse_cur)
print(rmse_bs)
```
```{r}
true_proportion = table(Kaufmann2021_subset2$clusterRes)/length(Cells(Kaufmann2021_subset2))
cur_proportion = table(Kaufmann2021_subset2$seurat_clusters)/length(Cells(Kaufmann2021_subset2))
## true mat
conf.mat <- table(Kaufmann2021_subset2$clusterRes, Kaufmann2021_subset2$seurat_clusters)
true.conf <- conf.mat/rowSums(conf.mat)
true.conf = true.conf[apply(true.conf, 2, which.max),] ## change order
print("-------true.conf--------")
print(true.conf)
## knn mat
graphs = Kaufmann2021_subset2@graphs$RNA_snn
labels = Idents(Kaufmann2021_subset2)
knn_mat = knn_simMat(graphs, labels)
knn_mat = knn_mat[colnames(true.conf), colnames(true.conf)]
print("-------knn_mat--------")
print(knn_mat)
## svm mat
svmDF = Kaufmann2021_subset2@reductions$pca@cell.embeddings %>%
as.data.frame() %>%
mutate(clusterRes = Kaufmann2021_subset2$seurat_clusters)
simil_matSVM = svm_simMat(svmDF[sample(1:nrow(svmDF), size = 9000),])
#simil_matSVM = cbind(simil_matSVM, rep(0,5))
#colnames(simil_matSVM) = c("0", "1", "2", "3", "4")
## evaluation
cur_proportion = table(Kaufmann2021_subset2$seurat_clusters)/length(Cells(Kaufmann2021_subset2))
names(cur_proportion) = rownames(true.conf)
rmse_cur = abs(cur_proportion[names(true_proportion)] - true_proportion) %>% sum()
print(rmse_cur)
bs_true_proportion = multinom_EM(cur_proportion, true.conf, verbose = FALSE)$mu
names(bs_true_proportion) = rownames(true.conf)
rmse_true_bs = abs(bs_true_proportion[names(true_proportion)] - true_proportion) %>% sum()
print(rmse_true_bs)
bs_knn_proportion = multinom_EM(cur_proportion, knn_mat, verbose = FALSE)$mu
names(bs_knn_proportion) = rownames(true.conf)
rmse_knn_bs = abs(bs_knn_proportion[names(true_proportion)] - true_proportion) %>% sum()
print(rmse_knn_bs)
bs_svm_proportion = multinom_EM(cur_proportion, simil_matSVM, verbose = FALSE)$mu
names(bs_svm_proportion) = rownames(true.conf)
rmse_svm_bs = abs(bs_svm_proportion[names(true_proportion)] - true_proportion) %>% sum()
print(rmse_svm_bs)
proportionDF = data.frame(true_proportion = as.vector(true_proportion), cur_proportion = as.vector(cur_proportion), bs_true_proportion, bs_knn_proportion, bs_svm_proportion)
print(proportionDF)
rmseL = list(rmse_cur = rmse_cur, rmse_true_bs = rmse_true_bs, rmse_svm_bs = rmse_svm_bs, rmse_knn_bs = rmse_knn_bs)
similL = list(true_mat = true.conf, knn_mat = knn_mat, svm_mat = simil_matSVM)
Kaufmann2021L = list(proportionDF = proportionDF, rmseL = rmseL, similL = similL)
```
```{r}
save(Kaufmann2021L, file = "/storage/holab/linxy/DCATS/real_world/Kaufmann_2021/Kaufmann2021Res.RData")
```