-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.nf
338 lines (233 loc) · 9.7 KB
/
main.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
//!/usr/bin/env nextflow
nextflow.enable.dsl=2
def helpMessage() {
log.info"""
=========================================
Skeletalvis-Transcriptomics
=========================================
Usage:
The typical command for running the pipeline is as follows:
nextflow run main.nf -profile slurm -params-file params/<accessionNumber>.yaml -with-singularity library://jsoul/default/skeletalvis-pipeline -entry <setup,skeletalvis>
Required arguments:
-profile Configuration profile to use. <local, slurm>
-params-file Yaml file containing parameters for the analysis
--with-singularity Recommend to run the pipeline within the provided singularity container
-entry Workflow to run for inital setup or analysis
""".stripIndent()
}
params.help = false
if (params.help){
helpMessage()
exit 0
}
//define the parameters
//outdirectory for the results
params.outdir = "$baseDir/data/${params.accessionNumber}"
//site to download the sra/fastq files if needed
params.downloadSite = "ENA"
//accession number for ENA
params.ENA = ""
//directory with the fastqFiles
//ignoring R0 fastq files(usually discarded reads in trimming) by default
params.fastqFileDir = "$params.outdir/fastqFiles/*[!_R0].fastq.gz"
downloadFiles = file(params.fastqFileDir).isEmpty()
//make sure the species is lower case to match the downloaded reference data
params.species = "Human"
species = params.species.toLowerCase()
//define the ensembl release used for the fasta and the gtf file
params.ensemblRelease = "103"
params.referenceDir = "$baseDir/references"
//GTF file to generate the gene lengths
params.GTF = "$params.referenceDir/transcriptomes/${species}_release${params.ensemblRelease}.gtf"
//fasta file for generating the kallisto index
params.fasta = "$params.referenceDir/transcriptomes/${species}_release${params.ensemblRelease}.fa.gz"
//directory with the scripts
params.scriptDir = "$baseDir/scripts"
//directory with the fastqscreen genomes
params.fastq_screen_conf = "$params.referenceDir/fastqscreen/FastQ_Screen_Genomes/fastq_screen.conf"
//index for kallisto
params.indexName = "${params.species}_release${params.ensemblRelease}IndexFile"
params.index = "$params.referenceDir/$params.indexName"
//stranded library option for kallisto?
params.stranded = false
//should trimming be skipped?
params.skipTrimming = false
//sample table for the experiment
params.sampleTable = "$baseDir/params/sampleTables/${params.accessionNumber}_sampleTable.txt"
//comparison table for the experiment
params.comparisonsTable = "$baseDir/params/comparisons/${params.accessionNumber}_comparisons.txt"
//number of cpu cores to use for sra to fastq parsing. Check nextflow.config for local cpu executor usage.
params.cpuCores = 1
//does the differential expression analysis need supervised batch effect correction (uses svaseq in supervised mode)?
params.supervised = "FALSE"
//protein-protein interaction network files
params.biogrid = "$params.referenceDir/networks/${species}BioGridNetwork.txt"
params.string = "$params.referenceDir/networks/${species}StringNetwork.txt"
//enrichment libraries
params.TF_CHEA3_LIB = "$params.referenceDir/enrichment/TFs.RDS"
params.reactomePathways = "$params.referenceDir/enrichment/reactomePathways${params.species}.RDS"
//gene length files - will create from a provided GTF file if not provided
geneLengthsFile = "${species}GeneLengths.txt"
params.geneLengths= "$params.referenceDir/$geneLengthsFile"
//other species to human homology files for mapping genes to pathway databases
params.homology= "$params.referenceDir/homology/Homology.${species}.txt"
//thresholds to interate through to define differentially expressed genes for the downstream analysis
params.foldchange = [1,1.5,2]
params.foldChangeOnly = "TRUE"
if (params.foldChangeOnly=="FALSE"){
params.padj = [1,0.05,0.01]
} else {
params.padj = [1]
}
//microarray workflow specific parameters
params.localData = false
params.grepExpression = "FALSE"
params.grepString = ""
params.numberLinesSkip = 0
params.numberFileRemove = 0
params.split = "FALSE"
params.splitField = ""
params.splitSep = ""
params.splitPos = ""
params.remove = "FALSE"
params.removeSample = ""
params.offset = 80
params.setup = false
params.asperaKey = "$params.referenceDir/asperaweb_id_dsa.openssh"
//setup module
include { SETUP } from './modules/install/setup'
//Microarray modules
include { getGeneExpressionData } from './modules/microarray/getGeneExpressionData'
include { microarrayQC } from './modules/microarray/microarrayQC'
include { microarrayChrDir } from './modules/microarray/microarrayChrDir'
include { microarrayPCA } from './modules/microarray/microarrayPCA'
include { microarrayDiffExp } from './modules/microarray/microarrayDiffExp'
//RNA-seq modules
include { sraDownload } from './modules/RNASeq/sraDownload'
include { enaDownload } from './modules/RNASeq/enaDownload'
include { fastqToSampleID } from './modules/RNASeq/fastqToSampleID'
include { trimFastqFiles } from './modules/RNASeq/trimFastqFiles'
include { fastQC } from './modules/RNASeq/fastQC'
include { fastqScreen } from './modules/RNASeq/fastqScreen'
include { kallistoQuant } from './modules/RNASeq/kallistoQuant'
include { RNASeqPCA } from './modules/RNASeq/RNASeqPCA'
include { RNASeqChrDir } from './modules/RNASeq/RNASeqChrDir'
include { RNASeqDiffExp } from './modules/RNASeq/RNASeqDiffExp'
include { multiQC } from './modules/RNASeq/multiQC'
include { abundancesToTxi } from './modules/RNASeq/abundancesToTxi'
include { kallistoIndex } from './modules/RNASeq/kallistoIndex'
//Functional analysis modules
include { goEnrichment } from './modules/functionalAnalysis/goEnrichment'
include { tfEnrichment } from './modules/functionalAnalysis/tfEnrichment'
include { cutFoldChangeTables } from './modules/functionalAnalysis/cutFoldChangeTables'
include { pathwayEnrichment } from './modules/functionalAnalysis/pathwayEnrichment'
include { biogridNetworks } from './modules/functionalAnalysis/biogridNetworks'
include { stringNetworks } from './modules/functionalAnalysis/stringNetworks'
include { volcanoPlot } from './modules/functionalAnalysis/volcanoPlot'
include { makeGeneLengths } from './modules/functionalAnalysis/makeGeneLengths'
if(!params.setup){
//number of comparisons in the differential expression analysis
params.numComps=file(params.comparisonsTable).countLines()-1
numberComps= (1..params.numComps)
}
//set up the directory with the required data
workflow setup {
SETUP()
}
//main entry point for the transcriptomics analysis
workflow skeletalvis {
if(params.Type=="Microarray") {
MicroarrayData()
} else {
RNASeqData()
}
}
//subworkflow to perform pathway, GO and transcription factor analysis
workflow functionalAnalysis {
take: diffExpTable
geneLengthFile
padj
foldchange
main:
goEnrichment(diffExpTable,geneLengthFile,padj,foldchange)
pathwayEnrichment(diffExpTable,geneLengthFile,padj,foldchange)
tfEnrichment(diffExpTable)
biogridNetworks(diffExpTable)
stringNetworks(diffExpTable)
volcanoPlot(diffExpTable)
}
//subworkflow to download the fastq files from ENA/SRA if needed
workflow downloadRNASeqData {
main:
if(params.downloadSite=="ENA") {
fastqFiles = enaDownload()
} else {
fastqFiles = sraDownload()
}
emit:
fastqFiles
}
//workflow to perform RNA-seq analysis including differential expression and functional analysis
workflow RNASeqData {
if(downloadFiles){
rawData = downloadRNASeqData().fastqFiles.flatten()
} else {
rawData = channel.fromPath( params.fastqFileDir).view()
}
//group the data if paired reads
groupedReads = rawData.map { file ->
def key = file.name.toString().split("_R\\d*")[0]
return tuple(key, file)
}
.groupTuple(sort:true).view()
if(!file(params.geneLengths).exists()){
geneLengths = makeGeneLengths(geneLengthsFile)
} else {
geneLengths = file(params.geneLengths)
}
if(!file(params.index).exists()){
index = kallistoIndex(params.indexName,params.fasta)
} else {
index = file(params.index)
}
groupedSamples = fastqToSampleID(groupedReads,params.sampleTable)
.groupTuple()
.map{id,fastq -> tuple(id,fastq.flatten().sort())}.view()
if(params.skipTrimming){
trimmedSamples = groupedSamples
} else {
trimmedSamples = trimFastqFiles(groupedSamples)
}
fastQC(trimmedSamples)
fastqScreen(trimmedSamples)
trimmedSamples.view()
kallistoQuant(trimmedSamples,index)
multiQC(fastQC.out.stats.collect(),fastqScreen.out.stats.collect(),kallistoQuant.out.stats.collect())
abundanceMatrix = abundancesToTxi(kallistoQuant.out.abundances.collect())
RNASeqPCA(abundancesToTxi.out.txiData)
RNASeqChrDir(abundancesToTxi.out.txiData)
diffTables = RNASeqDiffExp(abundancesToTxi.out.txiData)
cutTables = cutFoldChangeTables(diffTables,numberComps)
functionalAnalysis(cutTables,geneLengths,params.padj,params.foldchange)
}
//workflow to perform microarray analysis including differential expression and functional analysis
workflow MicroarrayData {
if(!file(params.geneLengths).exists()){
geneLengths = makeGeneLengths(geneLengthsFile)
} else {
geneLengths = file(params.geneLengths)
}
if( params.localData ){
rawData = Channel
.fromPath("$baseDir/params/localData/${params.accessionNumber}.RDS")
.ifEmpty { exit 1, "local data not found: $baseDir/params/localData/${params.accessionNumber}.RData" }
} else {
rawData = getGeneExpressionData()
}
microarrayQC(rawData)
microarrayChrDir(rawData)
microarrayPCA(rawData)
diffTables = microarrayDiffExp(rawData)
cutTables = cutFoldChangeTables(diffTables,numberComps)
functionalAnalysis(cutTables,geneLengths,params.padj,params.foldchange)
}