-
Notifications
You must be signed in to change notification settings - Fork 3
/
REVISION_Pluritest.Rmd
170 lines (126 loc) · 6.39 KB
/
REVISION_Pluritest.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
---
title: "PluriTest of isogenic hiPSC and hESC"
author: "Sam Buckberry"
date: "`r Sys.Date()`"
output: html_document
---
Load the script that contains functions used in the project
```{r, message=FALSE, warning=FALSE}
source("R/project_functions.R")
```
## Load the RNA-seq count data and associated sample meta-data
Read the sample count tables generated with `TEcount` and combine into matrix
```{r, cache=TRUE}
sample_dat <- fread("RNAseq/polyA/polyA_RNAseq_sample_table.tsv")
sample_dat$model_group <- str_c(sample_dat$Group, sample_dat$Donor, sep = "_")
# List the counts tables for all samples
cnt_files <- list.files(path = "RNAseq/polyA/MEL1_data/", full.names = TRUE, pattern = ".cntTable")
# Select the donor datasets
cnt_files <- cnt_files[grepl(pattern = "MEL1", cnt_files)]
dat2 <- lapply(cnt_files, read.table, header = TRUE, row.names = 1) %>% do.call(cbind, .)
libs <- colnames(dat2) %>%
str_remove("X.home.sbuckberry.working_data_02.polo_project.human_ips.RNAseq.polyA.aligned_data.") %>%
str_sub(start = 1, end = 6)
colnames(dat2) <- libs
sample_dat$group <- str_c(sample_dat$State, sample_dat$Media, sep = "_")
sample_dat$sample_id <- str_c(sample_dat$Timepoint, sample_dat$Donor, sample_dat$Media, sep = "_")
sample_dat$sample_id[duplicated(sample_dat$sample_id)] <- str_c(sample_dat$sample_id[duplicated(sample_dat$sample_id)], "_1")
sample_dat$model_group <- str_c(sample_dat$State, sample_dat$Media, sep = "_")
pDat <- sample_dat[match(colnames(dat2), sample_dat$Library), ]
# Is the meta data and expression data in the same order?
all(pDat$Library == colnames(dat2))
y <- DGEList(counts = dat2)
y$samples <- cbind(y$samples, pDat)
```
# Pluritest
Code from https://github.com/pluritest/pluritestCompared
```{r, cache=TRUE}
## Load the pluritest data from the manuscript
load("pluritestCompared/working/NMETH-BC11016B-pluritestsub.rdata")
featureNames(H9targetArray)<-fData(H9targetArray)[,1] # required for R3.x compatibility
unzip(zipfile = "pluritestCompared/working/forLumi.txt.zip",exdir = "./working/")
# definitions for plots and computations
WA09<-c("WA09_r1_HS","WA09_r2_HS","WA09_r1_OO","WA09_r1_TEL","WA09_r2_OO","WA09_r2_TEL","WA09_r1_AE","WA09_r2_AE")
UKSCB<-c( "Shef3_r1_OO","Shef3_r2_OO","Oxford2_r1_OO","Oxford2_r2_OO","I9_r1_OO","I9_r2_OO")
Murdoch<- c("MEL1_INS_GFP_MG3_r1_AE","MEL1_INS_GFP_MG4_r1_AE","hES3_Mixl_GFP_r1_AE","hES3_Mixl_GFP_r2_AE","RM3","RM31" )
Kyoto<- c( "Tig108_4f3_r1_HS","Tig108_4f3_r2_HS","iPS201B7_r1_HS", "iPS201B7_r2_HS","KhES.1_r1_HS","KhES.1_r2_HS" )
Wicell<- c("WA14_r1_JB","WA14_r2_JB","DF19.9.11T_4_r1_JB","DF19.9.11T_4_r2_JB","IMR90C4_r1_TEL","IMR90C4_r2_TEL")
# load the modified pluritest script
# functionality has been removed to allow R2.11 - R 3.x compatibility
# different output handling. Original is available in original workspace
source('pluritestCompared/pluritest_mod_file.R')
# load the data using patched version of original pluritest script.
# NormOnly provides only a normalized lumi object.
working.lumi1<-pluritest_mod("./working/forLumi.txt",NormOnly = TRUE)
# Add a simple Correction Factor to the Pluritest Modelmatrices
# This is very simple version of factor based batch effect removal as in RUV or SVA
M<-exprs(working.lumi1)
T<-exprs(H9targetArray)
T<-T[rownames(W15),]
M<-M[match(rownames(W15),rownames(M)),WA09]
shifttemp<-apply(M,1,mean)-T
shifttemp[is.na(shifttemp)] <- 0
W15<-cbind(W15[],shifttemp)
W12<-cbind(W12,shifttemp)
## Convert gene IDs to Illumina IDs
#pluridat <- read.table("~/Documents/GitHub/pluritest/project/data/RNA-seq_pluri.txt",
# header = TRUE)
## Load the expression counts and get corresponding array probes
## Match the Gene IDs to the Illumina probe IDs from the pluritest paper
ilmn_genes <- gprofiler2::gconvert(query = rownames(dat2),
target = "ILLUMINA_HUMANHT_12_V4")
ilmn_genes <- ilmn_genes[ilmn_genes$target %in% fData(H9targetArray)[,1], ]
dat_pluri <- dat2[rownames(dat2) %in% ilmn_genes$input, ]
dat_pluri <- cpm(dat_pluri, log = TRUE)
#dat_pluri <- voom(dat_pluri, normalize.method = "quantile")$E
ind <- match(ilmn_genes$input, rownames(dat_pluri))
dat_pluri <- dat_pluri[ind, ]
all(rownames(dat_pluri) == ilmn_genes$input)
rownames(dat_pluri) <- ilmn_genes$target
cpms <- dat_pluri
M <- cpms
Tr <- exprs(H9targetArray)
Tr <- Tr[rownames(W15), ]
M <- M[match(rownames(W15), rownames(M)), ]
shifttemp<-apply(M, 1, mean) - Tr
shifttemp[is.na(shifttemp)] <- 0
W15 <- cbind(W15[], shifttemp)
W12 <- cbind(W12, shifttemp)
### Code from https://github.com/pluritest/pluritestCompared/blob/master/pluritest_mod_file.R
#RSN NORMALIZATION OF THE DATA
A <- rownames(cpms) #matches on ILMN_Ids for lumi/RSN
B <- fData(H9targetArray)[,1] #for matches on ILMN_Ids for lumi/RSN
sel.match <- match(B,A)
sel <- match(rownames(W15), A)
working.lumi.int <- cpms[sel[!is.na(sel)], ]
coef <- c(-126.7095, 0.004567437, 0.004377068, 0.001043193)
H15.new <- predictH(working.lumi.int, W15[!is.na(sel), ])
H12.new <- predictH(working.lumi.int, W12[!is.na(sel), ])
rss.new <- apply((working.lumi.int - W12[!is.na(sel), ] %*% H12.new)^2, 2, sum)
RMSE.new <- sqrt(rss.new/sum(!is.na(sel)))
novel.new <- apply((working.lumi.int - W12[!is.na(sel), ] %*% H12.new)^8, 2, sum)
novel.new <- (novel.new/sum(!is.na(sel)))^(1/8)
s.new <- drop(coef[1] + coef[2:4] %*% H15.new[c(1, 14, 13), ])
table.results <- matrix(NA,nrow = ncol(working.lumi.int), ncol = 5)
rownames(table.results) <- colnames(working.lumi.int)
colnames(table.results) <- c("pluri-raw", "pluri logit-p", "novelty",
"novelty logit-p", "RMSD")
table.results[, 1] <- round(s.new, 3)
table.results[, 2] <- round(exp(s.new)/(1 + exp(s.new)),3)
table.results[, 3] <- round(novel.new, 3)
table.results[, 5] <- round(RMSE.new, 3)
pluri_df <- data.frame(table.results)
pluri_df$group <- y$samples$Group
```
```{r}
gg_pluri <- ggplot(data = pluri_df, aes(y = pluri.raw,
x = novelty, colour=group)) +
geom_point(size=2, alpha=0.8) + sams_pub_theme(legend_pos = "right", x.text.angle = 0,
hjust = 0.5) +
ylab("Pluripotency score") + xlab("Novelty score")
pdf("RNAseq/plots/pluritest_point.pdf", width = 3, height = 2)
gg_pluri
dev.off()
gg_nov <- ggplot(pluri_df[!pluri_df$group %in% c("Fibroblast", "Naive"), ], aes(x = group, y = novelty)) +
geom_point()
```