-
Notifications
You must be signed in to change notification settings - Fork 0
/
A.2023_05_29_Read10x.r
112 lines (94 loc) · 3.74 KB
/
A.2023_05_29_Read10x.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#Working on multi-batch, aggregated expression matrix for J1994 single cell data.
#Attempting to apply Batchelor to batch-correct and alter PCA reduction.
#Then will cluster and define cell types in aggregate.
#Prior approaches I attempted included (a) Azimuth alone applied in parallel to
# each sequencing sample, (b) Azimuth + manual annotations on each sequencing file.
#I sought to avoid analyzing aggregate matrix but will try now despite my attempt.
# 29 May 2023 AAG.
setwd("~/salvage_tmp/final/") #Parent directory of this code repository.
library(Seurat)
library(parallel)
library(dplyr)
#Load in aggregate data matrix. ---------------------------------------------
aggr = Read10X_h5(filename = './raw_data/scRNA/aggr/outs/count/filtered_feature_bc_matrix.h5')
seu = CreateSeuratObject(counts = aggr)
#I need to compare to files
meta = read.csv('./raw_data/scRNA/J1994_SC_run2_sampleIDs.csv')
meta$Sample.Name = gsub('-', '_', meta$Sample.Name)
sc_dirs_roots = c('AH_1GE_B9',
'AH_2GE_B10',
'AH_3GE_B11',
'AH_4GE_B12',
'AH_5GE_C1',
'AH_6GE_C2',
'AH_7GE_C3',
'AH_8GE_C4',
'C1D1GE',
'C2D1GE',
'C3D1GE',
'C6D1GE')
sc_dirs = paste0('./raw_data/scRNA//count/', sc_dirs_roots, '/filtered_feature_bc_matrix')
indiv_seurats = mclapply(sc_dirs, mc.cores = 5, FUN = function(x){
d = Read10X(x)
bcs = colnames(d)
return(bcs)
})
names(indiv_seurats) <- sc_dirs_roots
indiv_seurats <- lapply(indiv_seurats, FUN = function(x){
df = data.frame(Barcode = x)
return(df)
})
bc_key = bind_rows(indiv_seurats, .id = 'Sample')
bc_key$Batch = 1
bc_key$Batch[grep('AH_',bc_key$Sample)] = 2
idx = match(bc_key$Sample, meta$Sample.Name)
bc_key$Patient = meta$Subject.ID[idx]
bc_key$Patient[is.na(bc_key$Patient)] = 'J1994.001'
bc_key$Timepoint = meta$Timepoint[idx]
bc_key$Timepoint[bc_key$Sample %in% c('C2D1GE', 'C3D1GE', 'C6D1GE')] = 'Post'
bc_key$Timepoint[bc_key$Sample %in% 'C1D1GE'] = 'Pre'
bc_key$Barcode = substr(bc_key$Barcode,1,nchar(bc_key$Barcode) - 2)
idx = match(bc_key$Sample, sc_dirs_roots)
bc_key$Barcode = paste0(bc_key$Barcode, '-', idx)
bc_key$VDJ_beta = FALSE
bc_key$VDJ_alpha = FALSE
#For reference, also load in collected VDJ data from all samples. -----------
vdj_dirs = c('AH_1T_B1',
'AH_2T_B2',
'AH_3T_B3',
'AH_4T_B4',
'AH_5T_B5',
'AH_6T_B6',
'AH_7T_B7',
'AH_8T_B8',
'C1D1T',
'C2D1T',
'C3D1T',
'C6D1T'
)
vdj_dirs = paste0('./raw_data/scVDJ/', vdj_dirs, '.tsv')
vdj_data = mclapply(vdj_dirs, mc.cores = 5, FUN = function(x){
d = read.delim(x)
return(d)
})
names(vdj_data) = sc_dirs_roots
for(f in 1:12){ #Annotate which barcodes in single cell had beta chains in VDJ data.
vdjs = vdj_data[[f]]
beta_barcodes = unique(vdjs$cell_id[grep('TRBV', vdjs$v_call)])
beta_barcodes = substr(beta_barcodes, 1, nchar(beta_barcodes) - 2)
beta_barcodes = paste0(beta_barcodes, '-', f)
idx = match(beta_barcodes, bc_key$Barcode)
bc_key$VDJ_beta[idx] = TRUE
alpha_barcodes = unique(vdjs$cell_id[grep('TRAV', vdjs$v_call)])
alpha_barcodes = substr(alpha_barcodes, 1, nchar(alpha_barcodes) - 2)
alpha_barcodes = paste0(alpha_barcodes, '-', f)
idx = match(alpha_barcodes, bc_key$Barcode)
bc_key$VDJ_alpha[idx] = TRUE
}
write.csv(bc_key, file = paste0('./code_outputs/', Sys.Date(), '_aggregate-meta.csv'))
rownames(bc_key) = bc_key$Barcode
#Relate metadata to seurat object.
m = seu@meta.data
m = cbind(m, bc_key)
seu@meta.data = m
save(seu, file = paste0('./code_outputs/',Sys.Date(), '_AggrSeurat.rda'))