-
Notifications
You must be signed in to change notification settings - Fork 1
/
lipidomics_data_generation_script.R
936 lines (792 loc) · 44.1 KB
/
lipidomics_data_generation_script.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
### R-script for the LCMS data processing and quality control for the lipidomics of blood samples for the ADNI1 cohort.
## Author Dinesh Kumar Barupal dinkumar@ucdavis.edu West Coast Metabolomics Center, UC Davis USA
### This script has five steps. Follow instructions on each step to reproduce the results.
#Step 1. Extraction of peak heights from the extracted ion chromatogram files for targeted list of lipids.
#Step 2. LOESS normalization and batch effect removal.
#STEP 3. A rule based data Filtering to remove duplicate peaks
#STEP 4. Data visualization for the internal standards
#STEP 5: Combining POS and NEG mode data, additional filtering and data visualization
##################################################################################################################
### Step 1. Extraction of peak heights from the extracted ion chromatogram files for targeted list of lipids. ###
##################################################################################################################
# You will need a windows computer. The script was not tested on a linux machine.
# You need to have a basic understanding of R programming language to follow up the steps.
# Read the Nature Scientific Data Descriptor paper to get familiar with the data generationa and pipeline.
# Please download the "ADMC Lipidomics Extracted Ion Chromatograms" (adni1-lipidomics-eics.zip) file from ADNI repository.
# Make a new folder "ADNI1_lipidomics" in D: or E: drive in your computer. Do not use C: drive to avoid permission errors.
# Make two subfolders "ESIPOS_EIC" and "ESINEG_EIC" inside the ADNI1_lipidomics folder.
# Extract the adni1-lipidomics-eics.zip to the ADNI1_lipidomics folder. It has two sub-directories one for the positive mode and another for the negeative mode EICs.
# Copy the csv files from ESI pos folder to the "ESIPOS_EIC" and the files from ESI neg folder to the "ESINEG_EIC" folders.
# Download the "wcmc-lipidomics-database.zip" from the Sage Bionetwork (https://www.synapse.org/#!Synapse:syn10208654).
# Unzip the file. There are two txt files into it. Copy and paste the "esi_positive_mode_targets.txt" file into the "ESIPOS_EIC" folder and "esi_negative_mode_targets.txt" into the "ESINEG_EIC" folder.
# Download the "positive_mode_metadata.txt" and "negative_mode_metadata.txt" files from the Sage Bionetwork (Exact page location). These files contain the subject id to raw LCMS data file mapping and the time-stamp of the data acquisition. Copy these files into the ADNI1_lipidomics folder and into the "ESIPOS_EIC" and "ESINEG_EIC" folders.
# Download the duplicate_samples_ids.txt file from Synpase project page (https://www.synapse.org/#!Synapse:syn10208654).
# With this we are ready to proceeed.
# load Required R libraries for the step 1.
library(pacman) # This is the R package manager for easy installation and loading of the R-packages.
pacman::p_load(doSNOW, foreach,XLConnect)
options(java.parameters = "-Xmx4g") # Exporting Excel tables need more memory.
##################### Processing of the Positive mode data #######################
# In this step we will extract the peak height for each target in the esi_positive_mode_targets.txt file.
datadir <- "LOCATION of ESI Positive Mode CSV files" # provide the full location of the ESI positive mode EICs.
datadirfiles <- dir(datadir)
#csvfiles <- datadirfiles[grep("[.]csv$",datadirfiles)]
#### Import of Global Databases
postargetdb <- read.delim("esi_positive_mode_targets.txt", header = T, stringsAsFactors = F, sep="\t")
isddb.pos <- postargetdb[which(postargetdb$Type=="ISD"),]
isddb.pos <- isddb.pos[-which(isddb.pos$Cpd%in%c("ISD_TG","ISD_PE")==TRUE),]
targetdb.pos <- postargetdb[which(postargetdb$Type=="TargetCPD"),]
pos_metadata <- read.delim("positive_mode_metadata.txt", header = F, stringsAsFactors = F, sep="\t")
eiccsvfiles <- datadirfiles[grep("[.]CSV$",datadirfiles)]
cl<-makeCluster(6) ## number of cores goes here, for calculating faster.
#registerDoParallel(cl, cores = 6)
registerDoSNOW(cl)
foreach (i = 1:length(eiccsvfiles)) %dopar% {
con1 <- file(paste0(datadir,gsub("[.]CSV","",eiccsvfiles[i]),"_ISD_eics.txt"),"w")
con2 <- file(paste0(datadir,gsub("[.]CSV","",eiccsvfiles[i]),"_target_eics.txt"),"w")
isintendf <- readLines(paste0(datadir,"/",eiccsvfiles[i]),n=-1L)
isintendf <- c(isintendf,"+ESI EIC")
eicanchors <- grep("+ESI EIC",isintendf)
isdanchors.list <- lapply(1:(length(eicanchors)-1), function(x) {isintendf[eicanchors[x]:(eicanchors[x+1]-1)] })
targetnamevec <- sapply(isdanchors.list, function(x) { strsplit(x[1]," <|>:")[[1]][2]})
targetnamevec <- gsub("�","",targetnamevec)
### First get the Internal Standards Data from a raw LCMS data file.
idf <- isddb.pos
idf$samplert <- 1.0
idf$sampleid <- gsub("[.]CSV","",eiccsvfiles[i])
idf$deltart <- 0.0
idf$sampleintensity <- 0.0
for (k in 1:nrow(isddb.pos)) {
isrt <- idf$RT[k]
eicdf <- isdanchors.list[[which(targetnamevec==isddb.pos$Cpd[k])[1]]][-c(1:2)]
eicdf <- as.data.frame(do.call(rbind,lapply(eicdf,function(x) {strsplit(x,",")[[1]]})),stringsAsFactors = F)
eicdf$V2 <- as.numeric(eicdf$V2)
eicdf$V3 <- as.numeric(eicdf$V3)
rtrange <- which(eicdf$V2<(isrt+0.30) & eicdf$V2>(isrt-0.30)) #
isrt.sample <- eicdf$V2[rtrange[which.max(eicdf$V3[rtrange])]]
isinten.sample <- max(eicdf$V3[rtrange])
idf$samplert[k] <- isrt.sample
idf$deltart[k] <- isrt.sample-isrt
idf$sampleintensity[k] <- isinten.sample
}
write.table(idf,con1,col.names = T,row.names = F,quote = F, sep="\t")
## Correct the retention time using a polynomial linear model.
rtmodel <- lm(samplert ~ poly(RT,2), idf)
new.df <- data.frame(RT = targetdb.pos$RT) # predict the new RTs
newRtVec <- predict(rtmodel,new.df)
## Extract the peak heights for all the targets in the "esi_positive_mode_targets.txt" file.
tdf <- targetdb.pos
tdf$newRT <- newRtVec
tdf$samplert <- 1.0
tdf$sampleid <- gsub("[.]CSV","",eiccsvfiles[i])
tdf$deltart <- 0.0
tdf$sampleintensity <- 0.0
tdf$integration <- 1.0
for (k in 1:nrow(targetdb.pos)) {
isrt <- tdf$newRT[k]
eicdf <- isdanchors.list[[which(targetnamevec==targetdb.pos$Cpd[k])[1]]][-c(1:2)]
eicdf <- as.data.frame(do.call(rbind,lapply(eicdf,function(x) {strsplit(x,",")[[1]]})),stringsAsFactors = F)
eicdf$V2 <- as.numeric(eicdf$V2)
eicdf$V3 <- as.numeric(eicdf$V3)
rtrange <- which(eicdf$V2<(isrt+0.20) & eicdf$V2>(isrt-0.20))
isrt.sample <- eicdf$V2[rtrange[which.max(eicdf$V3[rtrange])]]
isinten.sample <- max(eicdf$V3[rtrange])
if(isinten.sample==0) {
rtrange <- which(eicdf$V2<(isrt+0.25) & eicdf$V2>(isrt-0.25)) # if the intensity of sample comes up as zero, we increases the threshold of the rt delta.
isrt.sample <- eicdf$V2[rtrange[which.max(eicdf$V3[rtrange])]]
isinten.sample <- max(eicdf$V3[rtrange])
is.inten.integration <- max(eicdf$V3[rtrange])/median(eicdf$V3[rtrange])
tdf$samplert[k] <- isrt.sample
tdf$deltart[k] <- isrt.sample-isrt
tdf$sampleintensity[k] <- isinten.sample
tdf$integration[k] <- is.inten.integration
} else {
is.inten.integration <- max(eicdf$V3[rtrange])/median(eicdf$V3[rtrange])
tdf$samplert[k] <- isrt.sample
tdf$deltart[k] <- isrt.sample-isrt
tdf$sampleintensity[k] <- isinten.sample
tdf$integration[k] <- is.inten.integration
}
}
write.table(tdf,con2,col.names = T,row.names = F,quote = F, sep="\t")
print(i)
close(con1)
}
stopCluster(cl)
closeAllConnections()
# Reading back the peak height values and combining them.
datadirfiles <- dir(datadir)
target_eic_files <- datadirfiles[grep("_target_eics.txt$",datadirfiles)]
pos_inten_list_delta <-lapply(1:length(target_eic_files), function(x) {
target_data <- read.delim(paste0(datadir,target_eic_files[x]), header = T, stringsAsFactors = F)
res <- data.frame(intensity=target_data$sampleintensity, rtdelta =target_data$deltart,integration = target_data$integration)
res
})
# Export the intensities values.
pos_inten_inten <- lapply(1:length(pos_inten_list_delta), function(x) {pos_inten_list_delta[[x]]$intensity })
pos_inten_inten <- as.data.frame(do.call(rbind, pos_inten_inten), stringsAsFactors = F)
pos_inten_inten <- round(pos_inten_inten,digits = 0)
pos_inten_inten <- rbind(targetdb.pos$Cpd,pos_inten_inten)
pos_inten_inten <- cbind(data.frame(filename=c("",gsub("_target_eics.txt","",target_eic_files))),pos_inten_inten)
pos_inten_inten$filename <- gsub("[.]2$","",pos_inten_inten$filename)
pos_inten_inten <- pos_inten_inten[- (which(pos_inten_inten$filename%in%pos_metadata$V1==FALSE)[-1]),]
tdf <- rbind(t(pos_inten_inten)[1,],as.character(sapply(pos_inten_inten$filename, function(x) { pos_metadata$V2[which(pos_metadata$V1==x)]})),as.character(sapply(pos_inten_inten$filename, function(x) { pos_metadata$V3[which(pos_metadata$V1==x)]})),t(pos_inten_inten)[-1,])
tdf <- tdf[,-1]
etdf <- cbind(as.character(c("","",pos_inten_inten[1,])),c("","","InChiKeys",rep("",length(pos_inten_inten[1,-1]))) ,c("","SampleLabel","TimeStamp",rep("",length(pos_inten_inten[1,-1]))),tdf )
write.table(etdf, file="etdf_positive_mode_target_intensities.txt",col.names = F, row.names = F, quote = F, sep="\t")
## export to excel directly.
exc <- XLConnect::loadWorkbook("adni-lipidomics-raw.xlsx", create = T) # this file will be used by the LOESS normalization script.
XLConnect::createSheet(exc,'sudo')
XLConnect::createSheet(exc,'POS')
XLConnect::writeWorksheet(exc, etdf, sheet = "POS", startRow = 1, startCol = 1,header = F)
#XLConnect::saveWorkbook(exc)
##################################################################################
##################### Processing of the Negetive mode data #######################
##################################################################################
# In this step we will extract the peak height for each target in the esi_negative_mode_targets.txt file. All the steps are same as for the ESI-positive mode data.
datadir <- "LOCATION OF NEGATIVE MODE DATA" # switch to the ESI neg EIC directory.
datadirfiles <- dir(datadir)
csvfiles <- datadirfiles[grep("[.]csv$",datadirfiles)]
#### Import of Global Databases
negtargetdb <- read.delim("esi_negative_mode_targets.txt", header = T, stringsAsFactors = F, sep="\t")
isddb.neg <- negtargetdb[which(negtargetdb$Type=="ISD"),]
targetdb.neg <- negtargetdb[which(negtargetdb$Type=="TargetCPD"),]
targetdb.neg$Cpd <- gsub("_a$|_b$","",targetdb.neg$Cpd)
neg_metadata <- read.delim("negative_mode_metadata.txt", header = F, stringsAsFactors = F, sep="\t")
eiccsvfiles <- datadirfiles[grep("[.]CSV$",datadirfiles)]
cl<-makeCluster(6) ## number of cores goes here, for calculating faster.
#registerDoParallel(cl, cores = 6)
registerDoSNOW(cl)
foreach (i = 1:length(eiccsvfiles)) %dopar% {
con1 <- file(paste0(datadir,gsub("[.]CSV","",eiccsvfiles[i]),"_ISD_eics.txt"),"w")
con2 <- file(paste0(datadir,gsub("[.]CSV","",eiccsvfiles[i]),"_target_eics.txt"),"w")
isintendf <- readLines(paste0(datadir,"/",eiccsvfiles[i]),n=-1L)
isintendf <- c(isintendf,"-ESI EIC")
eicanchors <- grep("-ESI EIC",isintendf)
isdanchors.list <- lapply(1:(length(eicanchors)-1), function(x) {isintendf[eicanchors[x]:(eicanchors[x+1]-1)] })
targetnamevec <- sapply(isdanchors.list, function(x) { strsplit(x[1]," <|>:")[[1]][2]})
targetnamevec <- gsub("�","",targetnamevec) ## some names has these weird chars. Clean this char in the targeted DB file.
### Get the Internal Standards Data
idf <- isddb.neg
idf$samplert <- 1.0
idf$sampleid <- gsub("[.]CSV","",eiccsvfiles[i])
idf$deltart <- 0.0
idf$sampleintensity <- 0.0
for (k in 1:nrow(isddb.neg)) {
isrt <- idf$RT[k]
eicdf <- isdanchors.list[[which(targetnamevec==isddb.neg$Cpd[k])[1]]][-c(1:2)]
eicdf <- as.data.frame(do.call(rbind,lapply(eicdf,function(x) {strsplit(x,",")[[1]]})),stringsAsFactors = F)
eicdf$V2 <- as.numeric(eicdf$V2)
eicdf$V3 <- as.numeric(eicdf$V3)
rtrange <- which(eicdf$V2<(isrt+0.30) & eicdf$V2>(isrt-0.30))
isrt.sample <- eicdf$V2[rtrange[which.max(eicdf$V3[rtrange])]]
isinten.sample <- max(eicdf$V3[rtrange])
idf$samplert[k] <- isrt.sample
idf$deltart[k] <- isrt.sample-isrt
idf$sampleintensity[k] <- isinten.sample
}
write.table(idf,con1,col.names = T,row.names = F,quote = F, sep="\t")
## Get the RT model
rtmodel <- lm(samplert ~ poly(RT,2), idf)
new.df <- data.frame(RT = targetdb.neg$RT) # predict the new RTs
newRtVec <- predict(rtmodel,new.df)
## Get the Targeted Lipid Data
tdf <- targetdb.neg
tdf$newRT <- newRtVec
tdf$samplert <- 1.0
tdf$sampleid <- gsub("[.]CSV","",eiccsvfiles[i])
tdf$deltart <- 0.0
tdf$sampleintensity <- 0.0
tdf$integration <- 1.0
for (k in 1:nrow(targetdb.neg)) {
#print(k)
isrt <- tdf$newRT[k]
eicdf <- isdanchors.list[[which(targetnamevec==targetdb.neg$Cpd[k])[1]]][-c(1:2)]
eicdf <- as.data.frame(do.call(rbind,lapply(eicdf,function(x) {strsplit(x,",")[[1]]})),stringsAsFactors = F)
eicdf$V2 <- as.numeric(eicdf$V2)
eicdf$V3 <- as.numeric(eicdf$V3)
rtrange <- which(eicdf$V2<(isrt+0.20) & eicdf$V2>(isrt-0.20))
isrt.sample <- eicdf$V2[rtrange[which.max(eicdf$V3[rtrange])]]
isinten.sample <- max(eicdf$V3[rtrange])
if(isinten.sample==0) {
rtrange <- which(eicdf$V2<(isrt+0.25) & eicdf$V2>(isrt-0.25)) #
isrt.sample <- eicdf$V2[rtrange[which.max(eicdf$V3[rtrange])]]
isinten.sample <- max(eicdf$V3[rtrange])
is.inten.integration <- max(eicdf$V3[rtrange])/median(eicdf$V3[rtrange])
tdf$samplert[k] <- isrt.sample
tdf$deltart[k] <- isrt.sample-isrt
tdf$sampleintensity[k] <- isinten.sample
tdf$integration[k] <- is.inten.integration
} else {
is.inten.integration <- max(eicdf$V3[rtrange])/median(eicdf$V3[rtrange])
tdf$samplert[k] <- isrt.sample
tdf$deltart[k] <- isrt.sample-isrt
tdf$sampleintensity[k] <- isinten.sample
tdf$integration[k] <- is.inten.integration
}
}
write.table(tdf,con2,col.names = T,row.names = F,quote = F, sep="\t")
#print(i)
close(con1)
}
closeAllConnections()
## Import the peak heights and combine them.
datadirfiles <- dir(datadir)
target_eic_files <- datadirfiles[grep("_target_eics.txt$",datadirfiles)]
neg_inten_list_delta <-lapply(1:length(target_eic_files), function(x) {
target_data <- read.delim(paste0(datadir,target_eic_files[x]), header = T, stringsAsFactors = F)
res <- data.frame(intensity=target_data$sampleintensity, rtdelta =target_data$deltart, integration=target_data$integration)
res
})
# Export the intensities values.
neg_inten_inten <- lapply(1:length(neg_inten_list_delta), function(x) {neg_inten_list_delta[[x]]$intensity })
neg_inten_inten <- as.data.frame(do.call(rbind, neg_inten_inten),stringsAsFactors = F)
neg_inten_inten <- round(neg_inten_inten,digits = 0)
## These files need to be in same folder.
negtargetdb <- read.delim("esi_negative_mode_targets.txt", header = T, stringsAsFactors = F, sep="\t")
isddb.neg <- negtargetdb[which(negtargetdb$Type=="ISD"),]
targetdb.neg <- negtargetdb[which(negtargetdb$Type=="TargetCPD"),]
neg_inten_inten <- rbind(targetdb.neg$Cpd,neg_inten_inten)
neg_inten_inten <- cbind(data.frame(filename=c("",gsub("_target_eics.txt","",target_eic_files))),neg_inten_inten)
neg_inten_inten$filename <- gsub("[.]2$","",neg_inten_inten$filename)
neg_inten_inten <- neg_inten_inten[- (which(neg_inten_inten$filename%in%neg_metadata$V1==FALSE)[-1]),]
tdf <- rbind(t(neg_inten_inten)[1,],as.character(sapply(neg_inten_inten$filename, function(x) { neg_metadata$V2[which(neg_metadata$V1==x)]})),as.character(sapply(neg_inten_inten$filename, function(x) { neg_metadata$V3[which(neg_metadata$V1==x)]})),t(neg_inten_inten)[-1,])
tdf <- tdf[,-1]
etdf <- cbind(as.character(c("","",neg_inten_inten[1,])),c("","","InChiKeys",rep("",length(neg_inten_inten[1,-1]))) ,c("","SampleLabel","TimeStamp",rep("",length(neg_inten_inten[1,-1]))),tdf )
write.table(etdf, file="etdf_negative_mode_target_intensities.txt",col.names = F, row.names = F, quote = F, sep="\t")
XLConnect::createSheet(exc,'NEG')
XLConnect::writeWorksheet(exc, etdf, sheet = "NEG", startRow = 1, startCol = 1,header = F)
XLConnect::saveWorkbook(exc)
#####################################################################
### Step 2. LOESS normalization and batch effect removal. ###
#####################################################################
# Copy the etdf_negative_mode_target_intensities.txt, etdf_positive_mode_target_intensities.txt and adni-lipidomics-raw.xlsx to the ADNI1_lipidomics folder.
# Run the following block of script for the loess normalization.
##########################################
### LOESS Normalization ##################
### Author Sili Fan slfan@ucdavis.edu ####
##########################################
# functions.
# ---- define batches ---- #
# Batches first defined as time intervals. Five batches defined. For each compound, if a t test shows a non-significant
# result between neighbor batches, then merge these two batches.
# Author Sili Fan, UCDavis May 2017
# BLOCK START - select from here to the line 521 and run.
DefineBatches = function(e,f,p,
time.intval.batch = TRUE, time = 'TimeStamp',number_of_batches=4,
batch = NULL,
QC.index = p$QC.index,
auto.merge.batch = T, p.adjust.method = 'none'){
# first define 5 batches according to time interval. We define 5 batches.
if(time.intval.batch){
time.diff = diff(p[[time]])
batch.index = which(time.diff%in%sort(time.diff,decreasing=T)[1:(number_of_batches-1)])
batch = rep(LETTERS[1:number_of_batches],times = diff(c(0,batch.index,nrow(p))))
}
# then if auto.merge.batch is true, for each compound, we use t test to merge two neighbor batches. Rule: if p > 0.05, we merge.
# this makes differnt compounds have different batch.
batches = matrix(NA,nrow=nrow(p),ncol=nrow(f))
if(auto.merge.batch){
for(i in 1:nrow(f)){
test=pairwise.wilcox.test(as.numeric(e[i,QC.index]), batch[QC.index],p.adjust.method = p.adjust.method)
test.p = diag(test$p.value)
batch.opt = LETTERS[1:(length(which(test.p<0.05))+1)]
batches[,i] = rep(batch.opt,times = diff(c(0,batch.index[which(test.p<0.05)],nrow(p))))
}
}else{
batches = matrix(rep(batch,ncol(batches)),ncol = ncol(batches))
}
return(t(batches))
}
get_loess_para = function(x,y,loess.span.limit = 0.5){ # use leave-one-out to select the best span parameter.
j = 0
error = rep(0, length(c(seq(loess.span.limit,1.5,0.1),1.75,2,2.25,2.5)))
if(class(x)=='numeric'|class(x)=='integer'){
for(par in c(seq(loess.span.limit,1.5,0.1),1.75,2,2.25,2.5)){
j = j+1
for(i in 2:(length(y)-1)){ # if i from 1 or to length(y), the prediction would be NA
o = loess(y[-i]~x[-i],span = par)
if(sum(is.na(o))){
error[j] = Inf
}else{
err = abs(predict(o,newdata = x[i]) - y[i])
error[j] = sum(error[j],err,na.rm = T)
}
}
}
}else if(class(x)=='data.frame'){
d = data.frame(y,x)
for(par in c(seq(loess.span.limit,1.5,0.1),1.75,2,2.25,2.5)){
j = j+1
for(i in 2:(length(y)-1)){ # if i from 1 or to length(y), the prediction would be NA
o = loess(y~.,d[-i,],span=par)
# o = loess(y[-i]~x[-i,],span = par)
if(sum(is.na(o))){
error[j] = Inf
}else{
err = abs(predict(o,newdata = d[i,colnames(x)]) - y[i])
error[j] = sum(error[j],err,na.rm = T)
}
}
}
}else{
stop("Error: The class of x should be either numeric or matrix")
}
return(c(seq(loess.span.limit,1.5,0.1),1.75,2,2.25,2.5)[which.min(error)])
}
remove_outlier = function(v){ # sometimes, the outlier of QC would be disaster when fitting loess. So we have to remove them.
out = boxplot(v,plot=F)$out
return(list(value = v[!v%in%out],index = which(v%in%out)))
}
shiftData = function(ori,norm){
# to make sure that after normalization the data still at the same scale as raw data.
ori.min = apply(ori,1,min,na.rm=T)
norm.min = apply(norm,1,min,na.rm=T)
return(norm - c(norm.min - ori.min))
}
loessNormalization = function(e,f,p,
batch = DefineBatches(e,f,p),
QC.index = p$QC.index,
time = "TimeStamp",
span.para = 'auto',loess.span.limit=0.5,
divide = F){
# batch is a matrix. columns are samples and rows are compounds.
# batch = define_batch(e=e,p=p,f=f)
# for each compound, calculate the loess line.
library(parallel)
cl = makeCluster(10)
e = data.matrix(e)
norms = parSapply(cl, X = 1:nrow(e), function(i,e,f,p,QC.index,batch,time,remove_outlier,span.para,get_loess_para,
loess.span.limit){
models = by(data.frame(v=e[i,QC.index],t=p[[time]][QC.index]),
batch[i,QC.index],function(x){
# x = data.frame(v=e[i,QC.index],t=p[[time]][QC.index])[batch[i,QC.index]=="B",]
if(length(remove_outlier(x$v)[[2]])>0){# if outlier exists.
span = ifelse(span.para=='auto',
get_loess_para(x=x$t[-remove_outlier(x$v)[[2]]],y=remove_outlier(x$v)[[1]],
loess.span.limit = loess.span.limit),span.para) # find a proper span.
}else{
span = ifelse(span.para=='auto',
get_loess_para(x=x$t,y=x$v,
loess.span.limit = loess.span.limit),span.para) # find a proper span.
}
if(length(remove_outlier(x$v)[[2]])>0){
loess(v~t,data=x[-remove_outlier(x$v)[[2]],],span=span)
}else{
loess(v~t,data=x,span=span)
}
})
# predict using the models.
norm = mapply(function(u,v){
o = tryCatch({
predict(u,newdata = v)
},
error = function(e){
print(e)
v
})
},models,by(p[[time]],batch[i,],function(x){x}))
norm = unlist(norm)
# replace NA with the closest value.
if(length(which(is.na(norm)))>0){
for(j in which(is.na(norm))){
time_notNA = p[[time]][-which(is.na(norm))]
closest_time = time_notNA[which.min(abs(time_notNA - p[[time]][j]))]
norm[j] = norm[which(p[[time]]==closest_time)]
}
}
return(norm)
},e,f,p,QC.index,batch,time,remove_outlier,span.para,get_loess_para,loess.span.limit)
norms = t(norms)
e_norm = matrix(NA,nrow=nrow(f),ncol=nrow(p))
if(divide){
for(i in 1:nrow(f)){
e_norm[i,] = e[i,] / (norms[i,] / median(e[i,]))
}
}else{
for(i in 1:nrow(f)){
e_norm[i,] = e[i,] - (norms[i,] - median(e[i,]))
}
}
rownames(e_norm) = rownames(e)
colnames(e_norm) = colnames(e)
stopCluster(cl)
return(list(e = shiftData(ori=e,norm = e_norm),f=f,p=p,normalize_line = norms))
}
# ---- read data ---- #
pacman::p_load(readxl, data.table)
# positive mode or negative mode
### Positive Mode Data
POSorNEG = "POS" ##
data = read_excel("adni-lipidomics-raw.xlsx",sheet = ifelse(POSorNEG=="POS",2,3))
f = data.table(data[3:nrow(data),c(1,2,3)]); colnames(f) = c("compound","RT","m.z")
p = data.table(t(data[c(1,2),])); colnames(p) = c("SampleLabel","TimeStamp"); p = p[-c(1,2,3)];
p$index = 1:nrow(p)
order = order(p$TimeStamp)
p$TimeStamp = as.numeric(p$TimeStamp)
p = p[order,]
p[,QC.index:=grepl("QC",SampleLabel)]
# define batches according to time.
timeInterval = which(diff(p[,TimeStamp])%in%sort(diff(p[,TimeStamp]),decreasing = T)[1:3])
batch = rep(LETTERS[1:4],diff(c(0,timeInterval,nrow(p))))
p[,batches:=batch]
e = as.matrix(data[3:nrow(data),4:ncol(data)]); e=apply(e,2,as.numeric)
e = e[,order]
batches=DefineBatches(e,f,p,auto.merge.batch = F)
p1 <- p[order(p$index),]
# loess normalization.
loess = loessNormalization(e,f,p) # model fitting
e_loess = loess$e # matrix of normalized data
p2 <- rbind(t(p1),e_loess[,order(p$index)])
f1 <- do.call(cbind,lapply(f,function(x) { c(rep("",(nrow(p2)-nrow(f))),x)})) ## adding some empty rows to the feature matrix.
p3 <- cbind(f1,p2)
# save normalized data.
fwrite(data.table(p3),paste0(POSorNEG,"_loessNormalized.csv"))
### Negative Modes Data
POSorNEG = "NEG" ##
data = read_excel("adni-lipidomics-raw.xlsx",sheet = ifelse(POSorNEG=="POS",2,3))
f = data.table(data[3:nrow(data),c(1,2,3)]); colnames(f) = c("compound","RT","m.z")
p = data.table(t(data[c(1,2),])); colnames(p) = c("SampleLabel","TimeStamp"); p = p[-c(1,2,3)];
p$index = 1:nrow(p)
order = order(p$TimeStamp)
p$TimeStamp = as.numeric(p$TimeStamp)
p = p[order,]
p[,QC.index:=grepl("QC",SampleLabel)]
# define batches according to time.
timeInterval = which(diff(p[,TimeStamp])%in%sort(diff(p[,TimeStamp]),decreasing = T)[1:3])
batch = rep(LETTERS[1:4],diff(c(0,timeInterval,nrow(p))))
p[,batches:=batch]
e = as.matrix(data[3:nrow(data),4:ncol(data)]); e=apply(e,2,as.numeric)
e = e[,order]
batches=DefineBatches(e,f,p,auto.merge.batch = F)
p1 <- p[order(p$index),]
# loess normalization.
loess = loessNormalization(e,f,p) # model fitting
e_loess = loess$e # matrix of normalized data
p2 <- rbind(t(p1),e_loess[,order(p$index)])
f1 <- do.call(cbind,lapply(f,function(x) { c(rep("",(nrow(p2)-nrow(f))),x)})) ## adding some empty rows to the feature matrix.
p3 <- cbind(f1,p2)
# save normalized data.
fwrite(data.table(p3),paste0(POSorNEG,"_loessNormalized.csv"))
# It has generated two files POS_loessNormalized.csv and NEG_loessNormalized.csv files.
# Block END
#####################################################################
### STEP 3. A rule based data Filtering to remove duplicate peaks ###
#####################################################################
# We will apply a series of rules to remove the poor quality signals and merge the ESI Pos and Neg mode data into one final data matrix.
###################
## Positive Mode ##
###################
postargetdb <- read.delim("esi_positive_mode_targets.txt", header = T, stringsAsFactors = F, sep="\t")
isddb.pos <- postargetdb[which(postargetdb$Type=="ISD"),]
isddb.pos <- isddb.pos[-which(isddb.pos$Cpd%in%c("ISD_TG","ISD_PE")==TRUE),]
targetdb.pos <- postargetdb[which(postargetdb$Type=="TargetCPD"),]
pos_metadata <- read.delim("positive_mode_metadata.txt", header = F, stringsAsFactors = F, sep="\t")
postdf <- read.csv("POS_loessNormalized.csv",header = T,stringsAsFactors = F)
lipidids <- postdf[(6:nrow(postdf)),1]
sampleids <- postdf[1,4:ncol(postdf)]
ikvec <- as.character(sapply(lipidids, function(x) { strsplit(x,"_")[[1]][2] }))
post.df.sb <- postdf[(6:nrow(postdf)),4:ncol(postdf)]
post.df.sb <- do.call(rbind,lapply(1:nrow(post.df.sb), function(x) { as.numeric(post.df.sb[x,])}))
post.df.sb <- round(as.data.frame(post.df.sb, stringsAsFactors = F),digits = 0)
post.df.qc <- post.df.sb[,grep("^QC",sampleids)]
## outliers length
outliervec <- lapply(1:nrow(post.df.qc), function(x) { boxplot.stats(as.numeric(post.df.qc[x,]),coef = 2)$out } )
## QC RSD calculation.
qcrsdvec <- sapply(1:nrow(post.df.qc), function(x) {
if(length(outliervec[[x]])>0 & length(outliervec[[x]]) < 9 ) {
medvec <- post.df.qc[x,-which(post.df.qc[x,]%in%outliervec[[x]]==TRUE)];
(sd(as.numeric(medvec))/mean(as.numeric(medvec)))*100
} else {
(sd(as.numeric(post.df.qc[x,]))/mean(as.numeric(post.df.qc[x,])))*100
}
})
## QC Median
qcmedianvec <- sapply(1:nrow(post.df.qc), function(x) { median(as.numeric(post.df.qc[x,])) })
## Sample Median
samplemedian <- sapply(1:nrow(post.df.qc), function(x) { median(as.numeric(post.df.sb[x,-grep("^QC",sampleids)])) } )
## 1000 Counts
sample1000counts <- sapply(1:nrow(post.df.qc), function(x) { length(which(as.numeric(post.df.sb[x,-grep("^QC",sampleids)])>=1000)) } )
includevec <- sapply(1:length(ikvec), function(x) {
len <- length(which(ikvec==ikvec[x]))
qrsd <- qcrsdvec[x]
qcmed <- qcmedianvec[x]
samed <- samplemedian[x]
c1000 <- sample1000counts[1]
include <- TRUE
q1 <- qcrsdvec[which(ikvec==ikvec[x])]
if(is.na(qrsd)) {
include <- FALSE
} else {
if ( qrsd > 25 ){ # QC RSD is over 25%, exclude it.
include =FALSE
} else {
if(len>1){
if(length(which(q1<25))==length(q1)){ # if all adducts have less than 20% CV then select the compounds with highest median value in QC
med1 <- qcmedianvec[which(ikvec==ikvec[x])]
if( which(ikvec==ikvec[x])[which.max(med1)]!=x){
include = FALSE
}
} else {
if(which(ikvec==ikvec[x])[which.min(q1)]!=x){
include=FALSE
}
}
}
}
}
include
})
includevec[which(qcmedianvec<2000)] <- FALSE
pos.export.df <- post.df.sb[includevec,]
pos.export.df <- cbind(lipidids[includevec], pos.export.df)
pos.export.df[,1] <- as.character(pos.export.df[,1])
names(pos.export.df) <- c("FullName",as.character(sampleids))
prtvec <- as.numeric(sapply(pos.export.df[,1], function(x) { postargetdb$RT[which(postargetdb$Cpd==x)[1]] }))
pmzvec <- as.numeric(sapply(pos.export.df[,1], function(x) { postargetdb$Mass[which(postargetdb$Cpd==x)[1]] }))
inchikeyvec <- as.character(sapply(pos.export.df[,1], function(x) { strsplit(x,"_")[[1]][2] }))
cpdmdf <- do.call(rbind,lapply(pos.export.df[,1], function(x) { strsplit(x," [[]|[]]")[[1]][c(1:2)]}))
pos.export.df.final <- cbind(LipidName=cpdmdf,MZ=pmzvec,RT=prtvec,InchiKey=inchikeyvec,pos.export.df)
write.table(pos.export.df.final,"adni_pos_lipidomics_loess_with_isd.txt", col.names = T,row.names = F, quote = F, sep="\t")
###################
## Negative mode ##
###################
#### Import of Global Databases
negtargetdb <- read.delim("esi_negative_mode_targets.txt", header = T, stringsAsFactors = F, sep="\t")
isddb.neg <- negtargetdb[which(negtargetdb$Type=="ISD"),]
targetdb.neg <- negtargetdb[which(negtargetdb$Type=="TargetCPD"),]
targetdb.neg$Cpd <- gsub("_a$|_b$","",targetdb.neg$Cpd)
neg_metadata <- read.delim("negative_mode_metadata.txt", header = F, stringsAsFactors = F, sep="\t")
negtdf <- read.csv("NEG_loessNormalized.csv",header = T,stringsAsFactors = F)
lipidids <- negtdf[(6:nrow(negtdf)),1]
sampleids <- negtdf[1,4:ncol(negtdf)]
notinposindex <- which(sampleids%in%c("B-422","QC044","QC023")==TRUE) ## These samples are not present in the positive modes.
sampleids <- sampleids[-notinposindex]
negtdf <- negtdf[,-notinposindex]
ikvec <- as.character(sapply(lipidids, function(x) { strsplit(x,"_")[[1]][2] }))
negt.df.sb <- negtdf[(6:nrow(negtdf)),4:ncol(negtdf)]
negt.df.sb <- do.call(rbind,lapply(1:nrow(negt.df.sb), function(x) { as.numeric(negt.df.sb[x,])}))
negt.df.sb <- round(as.data.frame(negt.df.sb, stringsAsFactors = F),digits = 0)
negt.df.qc <- negt.df.sb[,grep("^QC",sampleids)]
outliervec <- lapply(1:nrow(negt.df.qc), function(x) { boxplot.stats(as.numeric(negt.df.qc[x,]),coef = 2)$out } )
qcrsdvec <- sapply(1:nrow(negt.df.qc), function(x) {
if(length(outliervec[[x]])>0 & length(outliervec[[x]]) < 9 ) {
medvec <- negt.df.qc[x,-which(negt.df.qc[x,]%in%outliervec[[x]]==TRUE)];
(sd(as.numeric(medvec))/mean(as.numeric(medvec)))*100
} else {
(sd(as.numeric(negt.df.qc[x,]))/mean(as.numeric(negt.df.qc[x,])))*100
}
})
## QC Median
qcmedianvec <- sapply(1:nrow(negt.df.qc), function(x) { median(as.numeric(negt.df.qc[x,])) })
## Sample Median
samplemedian <- sapply(1:nrow(negt.df.qc), function(x) { median(as.numeric(negt.df.sb[x,-grep("^QC",sampleids)])) } )
## 1000 Counts
sample1000counts <- sapply(1:nrow(negt.df.qc), function(x) { length(which(as.numeric(negt.df.sb[x,-grep("^QC",sampleids)])>=1000)) } )
includevec <- sapply(1:length(ikvec), function(x) {
len <- length(which(ikvec==ikvec[x]))
qrsd <- qcrsdvec[x]
qcmed <- qcmedianvec[x]
samed <- samplemedian[x]
c1000 <- sample1000counts[1]
include <- TRUE
q1 <- qcrsdvec[which(ikvec==ikvec[x])]
if(is.na(qrsd)) {
include <- FALSE
} else {
if ( qrsd > 25 ){ # QC RSD is over 25%, exclude it.
include =FALSE
} else {
if(len>1){
if(length(which(q1<25))==length(q1)){ # if all adducts have less than 20% CV then select the compounds with highest median value in QC
med1 <- qcmedianvec[which(ikvec==ikvec[x])]
if( which(ikvec==ikvec[x])[which.max(med1)]!=x){
include = FALSE
}
} else {
if(which(ikvec==ikvec[x])[which.min(q1)]!=x){
include=FALSE
}
}
}
}
}
include
})
includevec[which(qcmedianvec<1000)] <- FALSE
neg.export.df <- negt.df.sb[includevec,]
neg.export.df <- cbind(lipidids[includevec], neg.export.df)
neg.export.df[,1] <- as.character(neg.export.df[,1])
names(neg.export.df) <- c("FullName",as.character(sampleids))
prtvec <- as.numeric(sapply(neg.export.df[,1], function(x) { negtargetdb$RT[which(negtargetdb$Cpd==x)[1]] }))
pmzvec <- as.numeric(sapply(neg.export.df[,1], function(x) { negtargetdb$Mass[which(negtargetdb$Cpd==x)[1]] }))
inchikeyvec <- as.character(sapply(neg.export.df[,1], function(x) { strsplit(x,"_")[[1]][2] }))
cpdmdf <- do.call(rbind,lapply(neg.export.df[,1], function(x) { strsplit(x," [[]|[]]")[[1]][c(1:2)]}))
neg.export.df.final <- cbind(LipidName=cpdmdf,MZ=pmzvec,RT=prtvec,InchiKey=inchikeyvec,neg.export.df)
write.table(neg.export.df.final,"adni_neg_lipidomics_loess_with_isd.txt", col.names = T,row.names = F, quote = F, sep="\t")
##############################################################
### STEP 4. Data visualization for the internal standards ###
###############################################################
# Positive mode
pos_raw <- read.delim("etdf_positive_mode_target_intensities.txt",header = T, stringsAsFactors = F)
pos_loess <- read.delim("adni_pos_lipidomics_loess_with_isd.txt", header = T, stringsAsFactors = F)
posisdvec <- grep("iSTD",pos_loess[,6])
## Only ISDs
pos_raw.isd <- pos_raw[sapply(posisdvec, function(x) { which(pos_raw$X==pos_loess[x,6])}),grep("QC",pos_raw[1,])]
pos_loess.isd <- pos_loess[posisdvec,grep("QC",names(pos_loess))]
pos_isd_rsd <- cbind(Name=pos_loess$LipidName.1[posisdvec], raw=round(sapply(1:nrow(pos_raw.isd), function(x) { sd(as.numeric(pos_raw.isd[x,]))/mean(as.numeric(pos_raw.isd[x,])) })*100,digits = 1), loess= round(sapply(1:nrow(pos_loess.isd), function(x) { sd(as.numeric(pos_loess.isd[x,]))/mean(as.numeric(pos_loess.isd[x,])) })*100,digits = 1) )
write.table(pos_isd_rsd,"adni_pos_isd_qc_rsd.txt",col.names = T, row.names = F,quote = F,sep="\t")
## All Compounds
pos_raw.isd <- pos_raw[sapply(posisdvec, function(x) { which(pos_raw$X==pos_loess[x,6])}),4:ncol(pos_raw)]
pos_loess.isd <- pos_loess[posisdvec,7:ncol(pos_loess)]
pos_isd_rsd <- cbind(Name=pos_loess$LipidName.1[posisdvec], raw=round(sapply(1:nrow(pos_raw.isd), function(x) { sd(as.numeric(pos_raw.isd[x,]))/mean(as.numeric(pos_raw.isd[x,])) })*100,digits = 1), loess= round(sapply(1:nrow(pos_loess.isd), function(x) { sd(as.numeric(pos_loess.isd[x,]))/mean(as.numeric(pos_loess.isd[x,])) })*100,digits = 1) )
write.table(pos_isd_rsd,"adni_pos_isd_all_rsd.txt",col.names = T, row.names = F,quote = F,sep="\t") ## RSD of ISD in all the samples.
# negative mode
neg_raw <- read.delim("etdf_negative_mode_target_intensities.txt",header = T, stringsAsFactors = F)
neg_loess <- read.delim("adni_neg_lipidomics_loess_with_isd.txt", header = T, stringsAsFactors = F)
negisdvec <- grep("iSTD",neg_loess[,6])
neg_raw.isd <- neg_raw[sapply(negisdvec, function(x) { which(neg_raw$X==neg_loess[x,6])}),grep("QC",neg_raw[1,])]
neg_loess.isd <- neg_loess[negisdvec,grep("QC",names(neg_loess))]
neg_isd_rsd <- cbind(Name=gsub("iSTD iSTD ","",neg_loess$LipidName.1[negisdvec]), raw=round(sapply(1:nrow(neg_raw.isd), function(x) { sd(as.numeric(neg_raw.isd[x,]))/mean(as.numeric(neg_raw.isd[x,])) })*100,digits = 1), loess= round(sapply(1:nrow(neg_loess.isd), function(x) { sd(as.numeric(neg_loess.isd[x,]))/mean(as.numeric(neg_loess.isd[x,])) })*100,digits = 1) )
write.table(neg_isd_rsd,"adni_neg_isd_qc_rsd.txt",col.names = T, row.names = F,quote = F,sep="\t")
neg_raw.isd <- neg_raw[sapply(negisdvec, function(x) { which(neg_raw$X==neg_loess[x,6])}),4:ncol(neg_raw)]
neg_loess.isd <- neg_loess[negisdvec,7:ncol(neg_loess)]
neg_isd_rsd <- cbind(Name=gsub("iSTD iSTD ","",neg_loess$LipidName.1[negisdvec]), raw=round(sapply(1:nrow(neg_raw.isd), function(x) { sd(as.numeric(neg_raw.isd[x,]))/mean(as.numeric(neg_raw.isd[x,])) })*100,digits = 1), loess= round(sapply(1:nrow(neg_loess.isd), function(x) { sd(as.numeric(neg_loess.isd[x,]))/mean(as.numeric(neg_loess.isd[x,])) })*100,digits = 1) )
write.table(neg_isd_rsd,"adni_neg_isd_all_rsd.txt",col.names = T, row.names = F,quote = F,sep="\t")
#################################################################
##### STEP 5: Combining POS and NEG, additional filtering and data visualization####
#################################################################
if(length(table(names(neg_loess)==names(neg_loess)))==1) {print("Positive and Negative mode data have same order for lipids")}
pn_merged.df <- rbind(cbind(ESIMode="POS",pos_loess), cbind(ESIMode="NEG",neg_loess))
pn_merged.df <- pn_merged.df[-grep("iSTD",pn_merged.df$FullName),] ## remove the internal standards
qcindex <- grep("^QC",names(pn_merged.df))
cpdnamevec <- pn_merged.df$LipidName.1
cpd.qc.rsdvec <- sapply(1:nrow(pn_merged.df), function(x) { (sd(as.numeric(pn_merged.df[x,qcindex]))/mean(as.numeric(pn_merged.df[x,qcindex])))*100 })
dup.names <- sapply(1:length(cpdnamevec), function(x) {
pn_merged.df$FullName[which(cpdnamevec==cpdnamevec[x])]
})
includevec <- sapply(1:length(cpdnamevec), function(x) {
len <- length(which(cpdnamevec==cpdnamevec[x]))
include <- TRUE
if(len>1){
q1 <- cpd.qc.rsdvec[which(cpdnamevec==cpdnamevec[x])]
if(which(cpdnamevec==cpdnamevec[x])[which.min(q1)]!=x){
include=FALSE
}
}
include
})
pn_merged.df.filt <- pn_merged.df[-which(includevec==FALSE),]
write.table(pn_merged.df.filt,"adni_lipidomics_normalized_merged.txt", col.names = T, row.names = F, quote = F, sep = "\t")
######################
##### Final Export ###
######################
lipidnames <- paste0("UCD.Lipid.",c(1:nrow(pn_merged.df.filt)))
datadict.df <- cbind(lipidnames,pn_merged.df.filt[,1:7])
datadict.df$Type <- "Known"
datadict.df$Type[grep("^CSH",datadict.df$FullName)] <- "UnKnown"
datamat <- cbind(lipidnames,pn_merged.df.filt[,-c(1:7)])
#exclude sample
excludesamples <- c("B.280","B.305","B.538","B.289","B.670")
datamat <- datamat[,-which(names(datamat)%in%excludesamples==TRUE)]
# QC RSDs
datamat.rsdvec <- round(sapply(1:nrow(datamat), function(x) { sd(datamat[x,grep("QC",colnames(datamat))])/mean(as.numeric(datamat[x,grep("QC",colnames(datamat))]) ) })*100, digits = 1)
datadict.df$RSD <- datamat.rsdvec
names(datadict.df) <- c("LipidID","ESIMode","LipidName","Adduct","M/Z","RT","InChiKey","FullName","Type","RSD in QCs")
write.table(datadict.df,"adni-lipidomics-data-dict.csv",col.names = T, row.names = F, quote = F,sep = ",")
write.table(datamat,"adni1-lipidomics.csv",col.names = T, row.names = F, quote = F,sep = ",")
write.table(datamat,"adni_lipidomics_final_submit.txt",col.names = T, row.names = F, quote = F,sep = "\t")
####################################
### RSD calculation all compounds ##
####################################
datamat <- read.delim("adni_lipidomics_final_submit.txt", header = T, stringsAsFactors = F)
qcindex <- grep("QC",names(datamat))
post.df.qc <- datamat[,qcindex]
## outliers length
outliervec <- lapply(1:nrow(datamat), function(x) { boxplot.stats(as.numeric(datamat[x,qcindex]),coef = 2)$out } )
qcrsdvec <- sapply(1:nrow(datamat), function(x) {
if(length(outliervec[[x]])>0 & length(outliervec[[x]]) < 9 ) {
medvec <- post.df.qc[x,-which(post.df.qc[x,]%in%outliervec[[x]]==TRUE)];
(sd(as.numeric(medvec))/mean(as.numeric(medvec)))*100
} else {
(sd(as.numeric(post.df.qc[x,]))/mean(as.numeric(post.df.qc[x,])))*100
}
})
write.table(cbind(datamat[,1],qcrsdvec), file="qc_rsd_targets.txt", col.names = F, row.names = F, sep="\t", quote=F)
#########################################
### Principal Component Analysis ########
#########################################
stat_file <- readChar("adni_lipidomics_final_submit.txt", nchars = 100000000)
library(pacman)
pacman::p_load(mixOmics,ggplot2, ReporteRs,svglite)
stat_file <- gsub("\r","",stat_file)
cfile <- strsplit(stat_file,"\n")[[1]]
df1 <- as.data.frame(do.call(rbind, lapply(cfile, function (x) { strsplit(x,"\t")[[1]] } )),stringsAsFactors = F)
samplenames <- df1[1,][-1]
sampletype <- rep("Sample",length(samplenames))
sampletype[grep("QC",samplenames)] <- "QC"
cpdnames <- df1[,1][-1]
df2 <- df1[2:nrow(df1),2:ncol(df1)]
pcamat <- do.call("cbind",lapply(df2,as.numeric))
pcamat[which(is.na(pcamat)==TRUE)] <- min(pcamat[which(is.na(pcamat)==FALSE)])
pcamat[which(is.na(pcamat)==TRUE)] <- 100
pcamat[which(pcamat==0)] <- 100 ## zero need to replaced with some number.
pcamat <- round(log(pcamat),digits = 5)
pca.res <- mixOmics::pca(log(pcamat), ncomp=2,max.iter=100,center = T, scale = F)
pc_scores <- pca.res$loadings[[1]]
data_bw <- data.frame(snames = as.character(samplenames), stype=sampletype, PC1 = pc_scores[,1], PC2=pc_scores[,2], stringsAsFactors = F)
data_bw$PC1 <- as.numeric(pc_scores[,1])
data_bw$PC2 <- as.numeric(pc_scores[,2])
f2 <- ggplot(data_bw, aes(PC2,PC1, label=snames)) +
scale_y_continuous(paste("PC2 - variance explained : ",signif(pca.res$explained_variance[2],digit=2)*100," % ",sep="")) +
scale_x_continuous(paste("PC1 - variance explained : ",signif(pca.res$explained_variance[1],digit=2)*100," % ",sep="")) +
theme_bw() +
labs(title = "Principal component analysis (PCA)") +
theme(
plot.title = element_text(face="bold", size=20),
axis.title.x = element_text(face="bold", size=20),
axis.title.y = element_text(face="bold", size=20, angle=90),
panel.grid.major = element_blank(), # switch off major gridlines
panel.grid.minor = element_blank(), # switch off minor gridlines
panel.background = element_rect(fill = "transparent",colour = NA), # or theme_blank()
plot.background = element_rect(fill = "transparent",colour = NA),
legend.position = c(0.9,0.9), # manually position the legend (numbers being from 0,0 at bottom left of whole plot to 1,1 at top right)
legend.title = element_blank(), # switch off the legend title
legend.text = element_text(size=20),
legend.key.size = unit(1.5, "lines"),
legend.key = element_blank(), # switch off the rectangle around symbols in the legend
legend.spacing = unit(.05, "cm"),
axis.text.x = element_text(size=10,angle = 90, hjust = 1),
axis.text.y = element_text(size=10,angle = 0, hjust = 1)
)
f3 <- f2 + geom_point(aes(colour=stype), size=5) #+ geom_text(aes(label=snames),size=5,hjust=0, vjust=0)
f3
#f4 <- f3 + stat_ellipse( aes_string(colour=colnames(data_bw)[2] ), linetype = 1 )
#plot(f4)
wbp <- pptx(template = "./data/aggieplot_template_v1.pptx")
wbp <- addSlide( wbp, "lipidClust" )
wbp <- addPlot( wbp, function( ) print( f3 ), offx =.1 , offy = .1, width = 10, height = 7 , vector.graphic = TRUE )
wbp <- addParagraph( wbp, value = c("Principal component analysis shows that there is a clear difference in the two groups compared.") )
writeDoc( wbp, file = paste0("adni_lipidomics_pca.pptx") )
### Paired ttest analysis of replicated samples
### Duplicate Samples Data Matrix.###
datamat <- read.delim("adni_lipidomics_final_submit.txt", header = T, stringsAsFactors = F)
dupids <- read.delim("duplicate_samples_ids.txt", header = T, stringsAsFactors = F)
dupids$SampleLabel <- gsub("-",".",dupids$SampleLabel)
dupdf <- as.data.frame(do.call(cbind,lapply(dupids$SampleLabel, function(x) { datamat[,which(colnames(datamat)==x)]})),stringsAsFactors =F )
dupdf <- rbind(dupids$Pairs,dupdf)
names(dupdf) <- dupids$SampleLabel
dupdf <- cbind(SampleLabel=c("Pairs",as.character(datamat[,1])), dupdf)
write.table(dupdf, "ADNI_duplicate_samples.txt",col.names = T, row.names = F, quote = F,sep="\t")
dupdf <- read.delim("ADNI_duplicate_samples.txt", header = F, stringsAsFactors = F)
dupdf.mdata <- dupdf[1:2,]
dupdf.mdata <- dupdf.mdata[,-1]
dupdf <- dupdf[-c(1:2),]
lipidnames <- dupdf[,1]
dupdf <- dupdf[,-1]
pairdf <- sapply(names(table(as.numeric(dupdf.mdata[2,]))), function(x) { which(dupdf.mdata[2,]==x) })
ttest.pvals <- sapply( 1:nrow(dupdf), function(x) {
res <- t.test(as.numeric(dupdf[x,pairdf[1,]]),as.numeric(dupdf[x,pairdf[2,]]), paired = T)
res$p.value
})
write.table(cbind(lipidnames,ttest.pvals), file="pairedpvalues.txt", col.names = F, row.names = F, sep="\t", quote=F)