-
Notifications
You must be signed in to change notification settings - Fork 0
/
runWorkflow.py
146 lines (97 loc) · 8.44 KB
/
runWorkflow.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
from CTDopts.CTDopts import _InFile, CTDModel, args_from_file
import sys
import os
import subprocess
import re
pattern = re.compile('Q\w{4}[0-9]{3}[a-zA-Z]\w')
wf_dir = sys.argv[1]
ctd_params = args_from_file(wf_dir + '/WORKFLOW-CTD')
ctd_files = args_from_file(wf_dir + '/IN-FILESTOSTAGE')
r_scripts = '/lustre_cfc/software/qbic/openms/openMS2.0-immuno-src/share/OpenMS/SCRIPTS/{s}'
fasta_decoy_path = '/lustre_cfc/qbic/reference_genomes/Homo_sapiens/Proteome/swissprotHUMANwoi_130927.revCat.fasta'
data_path = '%s/data/' % wf_dir
result_path = '%s/result/' % wf_dir
log_path = '%s/logs/' % wf_dir
mzmlFiles = []
for filePath in ctd_files['Mass Spectrometry Data']:
fileName = filePath.split('/')[-1]
mzmlFiles.append('%s%s' % (data_path, fileName))
allele_fileName = ctd_files['MHC Alleles'].split('/')[-1]
allelePath = '%s%s' % (data_path, allele_fileName)
logfilename = 'ligandomicsQC_1_0_workflow.logs'
logfile = open(logfilename, 'w')
param_file = '%s/src/comet_MHCI_2015024.params' % wf_dir
# QC
# mzml processing
for mzml in mzmlFiles:
if mzml.endswith('.gz'):
logfile.write("Extracting gzipped content... \n")
cmd = "gzip -d {f}".format(f=mzml)
os.system(cmd)
mzml = mzml.replace('.gz', '')
idPath = mzml.replace('mzML', 'idXML')
identifier = mzml.split('/')[-1].split('.')[0]
if ctd_params['centroided'] == 'false':
pickpeakcommand = 'PeakPickerHiRes -in {i} -out {o} -algorithm:ms_levels 1'
subprocess.call(pickpeakcommand.format(i=mzml, o=mzml).split(),stderr=logfile, stdout=logfile)
ffcom = 'FileFilter -in {i} -out {o} -peak_options:remove_chromatograms -peak_options:indexed_file true'
subprocess.call(ffcom.format(i=mzml, o=mzml).split(),stderr=logfile, stdout=logfile)
commandComet = 'comet.2015024.linux.exe -P{p} {s}'.format(p=param_file, s=mzml)
subprocess.call(commandComet.split(),stderr=logfile, stdout=logfile)
commandFileConverter = 'IDFileConverter -in {f} -out {o} {a}'
annotation = '-mz_file {f} -add_ionmatch_annotation'
subprocess.call(commandFileConverter.format(f=mzml.replace('mzML','pep.xml'), o=idPath, a='').split(),stderr=logfile, stdout=logfile)
subprocess.call(commandFileConverter.format(f=idPath, o=idPath, a=annotation.format(f=mzml)).split(),stderr=logfile, stdout=logfile)
peptideIndexer = 'PeptideIndexer -in {f} -out {o} -fasta {d} -decoy_string XXX -prefix -enzyme:specificity none'.format(f=idPath, o=idPath, d=fasta_decoy_path)
subprocess.call(peptideIndexer.split(),stderr=logfile, stdout=logfile)
### predict hits of fitting length and calc FDR and PEP
#FDR calc
falseDiscovery = 'FalseDiscoveryRate -in {f} -out {o} -algorithm:add_decoy_peptides'.format(f=idPath,o=idPath)
subprocess.call(falseDiscovery.split(),stderr=logfile, stdout=logfile)
postErrorProb = 'IDPosteriorErrorProbability -in {f} -out {o}'.format(f=idPath, o=idPath)
subprocess.call(postErrorProb.split(),stderr=logfile, stdout=logfile)
#predict
peptideHit = 'python peptidehit_predict_netMHC.py -c {c} -in {f} -out {o} -allele {a}'.format(f=idPath,o=idPath,a=allelePath,c=ctd_params['MHC class'])
subprocess.call(peptideHit.split(),stderr=logfile, stdout=logfile)
#QC creation
imageCreator = 'ImageCreator -in {f} -out {o} -width 640 -height 640'.format(f=mzml,o=mzml.replace('mzML', 'png'))
subprocess.call(imageCreator.split(),stderr=logfile, stdout=logfile)
#Feature Finding
featureFinder = 'FeatureFinderCentroided -in {f} -out {o} -ini {i}'.format(f=mzml,o=mzml.replace('mzML', 'featureXML'),i='FeatureFinderCentroided.ini')
subprocess.call(featureFinder.split(),stderr=logfile, stdout=logfile)
#fork FDR filtered file
filterfile = 'IDFilter -in {f} -out {o} -score:pep 0.05 -remove_decoys'
subprocess.call(filterfile.format(f=idPath,o=idPath.replace('idXML', 'filtered.idXML')).split(),stderr=logfile, stdout=logfile)
#QCCalculator
qcresult = os.path.join(result_path, mzml.replace('mzML', 'qcML').split('/')[-1])
qccalc = 'QCCalculator -in {f} -feature {ff} -id {i} -out {o}'.format(f=mzml, ff=mzml.replace('mzML', 'featureXML'), i=idPath.replace('idXML', 'filtered.idXML'), o=qcresult)
subprocess.call(qccalc.split(),stderr=logfile, stdout=logfile)
qcembedd = 'QCEmbedder -in {f} -out {o} -plot {p} -qp_att_acc {qp} -cv_acc {cv}'
#QCEmbedder
subprocess.call(qcembedd.format(f=qcresult, o=qcresult, p=mzml.replace('mzML', 'png'), qp='QC:0000004', cv='QC:0000055').split(),stderr=logfile, stdout=logfile)
qextract = 'QCExtractor -qp {qp} -out_csv {o} -in {i}'
subprocess.call(qextract.format(qp='QC:0000022' , o=data_path + identifier + '_qc22.csv', i=qcresult).split(), stderr=logfile, stdout=logfile)
r_command = 'Rscript --vanilla {s} {i} {o}'
subprocess.call(r_command.format(s=r_scripts.format(s='ProduceQCFigures_tic.R'), i=data_path + identifier + '_qc22.csv', o=data_path + identifier + 'tic.png').split(), stderr=logfile, stdout=logfile)
subprocess.call(qcembedd.format(qp='QC:0000023', cv='MS:1000235', f=qcresult, o=qcresult, p=data_path + identifier + '_tic.png').split(), stderr=logfile, stdout=logfile)
subprocess.call(qextract.format(qp='QC:0000038', o=data_path + identifier + '_qc38.csv', i=qcresult).split(),stderr=logfile, stdout=logfile)
subprocess.call(r_command.format(s=r_scripts.format(s='ProduceQCFigures_acc.R'), i=data_path + identifier + '_qc38.csv', o=data_path + identifier +'_acc.png').split(), stderr=logfile, stdout=logfile)
subprocess.call(qcembedd.format(qp='QC:0000041', cv='QC:0000053', f=qcresult, o=qcresult, p=data_path + identifier + '_acc.png').split(), stderr=logfile, stdout=logfile)
subprocess.call(r_command.format(s=r_scripts.format(s='ProduceQCFigures_rt_acc.R'), i=data_path + identifier +'_qc38.csv', o=data_path + identifier + '_acc_rt.png').split(), stderr=logfile, stdout=logfile)
subprocess.call(qcembedd.format(qp='QC:0000041', cv='QC:0000054', f=qcresult, o=qcresult, p=data_path + identifier + '_acc_rt.png').split(), stderr=logfile, stdout=logfile)
subprocess.call(qextract.format(qp='QC:0000018', o=data_path + identifier + '_qc18.csv', i=qcresult).split(), stderr=logfile, stdout=logfile)
subprocess.call(r_command.format(s=r_scripts.format(s='ProduceQCFigures_inj.R'), i=data_path + identifier + '_qc18.csv', o=data_path + identifier + '_inj.png').split(), stderr=logfile, stdout=logfile)
subprocess.call(qcembedd.format(qp='QC:0000023', cv='QC:0000051', f=qcresult, o=qcresult, p=data_path + identifier + '_inj.png').split(), stderr=logfile, stdout=logfile)
subprocess.call(qextract.format(qp='QC:0000044', o=data_path + identifier + '_qc44.csv', i=qcresult).split(), stderr=logfile, stdout=logfile)
subprocess.call(r_command.format(s=r_scripts.format(s='ProduceQCFigures_idmap.R'), i=data_path + identifier + '_qc44.csv' + ' ' + data_path + identifier + '_qc38.csv', o=data_path + identifier + '_idmap.png').split(), stderr=logfile, stdout=logfile)
subprocess.call(qcembedd.format(qp='QC:0000035', cv='QC:0000052', f=qcresult, o=qcresult, p=data_path + identifier + '_idmap.png').split(), stderr=logfile, stdout=logfile)
subprocess.call('python SEperformance.py -in {i} -out {o}'.format(i=idPath, o=data_path + identifier + '_performance.tsv').split(), stderr=logfile, stdout=logfile)
subprocess.call(r_command.format(s='ProduceQCFigures_allelecounts.R', i=data_path + identifier + '_performance.tsv', o=data_path + identifier + '_allelecounts.png').split(), stderr=logfile, stdout=logfile)
subprocess.call(qcembedd.format(qp='QC:0000023', cv='QC:0000051', f=qcresult, o=qcresult, p=data_path + identifier + '_allelecounts.png').split(), stderr=logfile, stdout=logfile)
subprocess.call(r_command.format(s='ProduceQCFigures_bindercounts.R', i=data_path + identifier + '_performance.tsv', o=data_path + identifier + '_bindercounts.png').split(), stderr=logfile, stdout=logfile)
subprocess.call(qcembedd.format(qp='QC:0000023', cv='QC:0000051', f=qcresult, o=qcresult, p=data_path + identifier + '_bindercounts.png').split(), stderr=logfile, stdout=logfile)
r_command = 'Rscript --vanilla {s} {i} {o} {l}'
subprocess.call(r_command.format(s='FDRoptimizer.R', i=data_path + identifier + '_performance.tsv', o=data_path + identifier + '_fdroptimizer.png', l=data_path + identifier + '_temp.csv').split(), stderr=logfile, stdout=logfile)
subprocess.call(qcembedd.format(qp='QC:0000023', cv='QC:0000051', f=qcresult, o=qcresult, p=data_path + identifier + '_fdroptimizer.png').split(), stderr=logfile, stdout=logfile)
logfile.close()
subprocess.call(["mv", logfilename, log_path])