-
Notifications
You must be signed in to change notification settings - Fork 0
/
bwamem.xml
424 lines (351 loc) · 20.9 KB
/
bwamem.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
<?xml version="1.0"?>
<tool id="bwa_mem" name="BWA-MEM" version="0.1">
<requirements>
    <!-- Execution environment: the job is run inside the "pcap_tools" Docker
         container. NOTE(review): the image is presumably expected to provide
         the bwa and samtools executables called from the command section;
         confirm against the image definition. -->
    <container type="docker">pcap_tools</container>
</requirements>
<!-- Short text shown next to the tool name in the Galaxy tool panel. -->
<description>- map medium and long reads (> 100 bp) against reference genome</description>
<command><![CDATA[
## The whole template is wrapped in CDATA so that shell operators such as
## the logical AND, redirections and pipes do not need to be XML-escaped.
##
## Overall flow:
##   1. Resolve the reference: either link + index a FASTA from the history,
##      or use the path of a pre-built index from the local cache.
##   2. Run bwa mem with the options selected in the form.
##   3. Convert the SAM stream to BAM and sort it into output.bam.

#set $reference_fasta_filename = "localref.fa"

#if str( $reference_source.reference_source_selector ) == "history":

    ln -s "${reference_source.ref_file}" "${reference_fasta_filename}" &&

    ## The following shell commands decide which of the BWA indexing algorithms
    ## (IS or BWTSW) will be run depending on the size of the input FASTA dataset.
    ## stat -L is required because the reference is a symlink: without it the
    ## size of the link itself would be measured. The GNU form is tried first
    ## and the BSD/OSX form is used only if the GNU form fails, so exactly one
    ## index is ever built.
    (
        size=`stat -L -c %s "${reference_fasta_filename}" 2>/dev/null || stat -L -f %z "${reference_fasta_filename}" 2>/dev/null`;
        if [ -z "\$size" ];
        then
            ## Size could not be determined: let bwa pick the algorithm itself.
            bwa index "${reference_fasta_filename}" &&
            echo "Generating BWA index with automatically selected algorithm";
        elif [ "\$size" -lt 2000000000 ];
        then
            bwa index -a is "${reference_fasta_filename}" &&
            echo "Generating BWA index with is algorithm";
        else
            bwa index -a bwtsw "${reference_fasta_filename}" &&
            echo "Generating BWA index with bwtsw algorithm";
        fi;
    ) &&

#else:
    #set $reference_fasta_filename = str( $reference_source.ref_file.fields.path )
#end if

## Begin BWA-MEM command line

bwa mem
-t "\${GALAXY_SLOTS:-1}"
-v 1                                                                                      ## Verbosity is set to 1 (errors only)

#if str( $fastq_input.fastq_input_selector ) == "paired_iv":                              ## For interleaved fastq files set -p option
    -p
    #if str( $fastq_input.iset_stats ):                                                   ## check that insert statistics is used
        -I "${fastq_input.iset_stats}"
    #end if
#end if

#if str( $analysis_type.analysis_type_selector ) == "pacbio":
    ## -x takes the preset name as its argument; a bare -x would swallow the next token.
    -x pacbio
#elif str( $analysis_type.analysis_type_selector ) == "full":
    #if str( $analysis_type.algorithmic_options.algorithmic_options_selector ) == "set":  ## Algorithmic options
        -k "${analysis_type.algorithmic_options.k}"
        -w "${analysis_type.algorithmic_options.w}"
        -d "${analysis_type.algorithmic_options.d}"
        -r "${analysis_type.algorithmic_options.r}"
        -y "${analysis_type.algorithmic_options.y}"
        -c "${analysis_type.algorithmic_options.c}"
        -D "${analysis_type.algorithmic_options.D}"
        -W "${analysis_type.algorithmic_options.W}"
        -m "${analysis_type.algorithmic_options.m}"
        ${analysis_type.algorithmic_options.S}
        ${analysis_type.algorithmic_options.P}
        ${analysis_type.algorithmic_options.e}
    #end if

    #if str( $analysis_type.scoring_options.scoring_options_selector ) == "set":          ## Scoring options
        -A "${analysis_type.scoring_options.A}"
        -B "${analysis_type.scoring_options.B}"
        -O "${analysis_type.scoring_options.O}"
        -E "${analysis_type.scoring_options.E}"
        -L "${analysis_type.scoring_options.L}"
        -U "${analysis_type.scoring_options.U}"
    #end if

    #if str( $analysis_type.io_options.io_options_selector ) == "set":                    ## IO options
        -T "${analysis_type.io_options.T}"
        -h "${analysis_type.io_options.h}"
        ${analysis_type.io_options.a}
        ${analysis_type.io_options.C}
        ${analysis_type.io_options.V}
        ${analysis_type.io_options.Y}
        ${analysis_type.io_options.M}
    #end if
#end if

#if str( $rg.rg_selector ) == "set":
    -R "@RG\tID:$rg.ID\tSM:$rg.SM"
#end if

#if str( $fastq_input.fastq_input_selector ) == "paired":
    #if str( $fastq_input.iset_stats ):                                                   ## check that insert statistics is used
        -I "${fastq_input.iset_stats}"
    #end if
    "${reference_fasta_filename}"
    "${fastq_input.fastq_input1}" "${fastq_input.fastq_input2}"
#elif str( $fastq_input.fastq_input_selector ) == "paired_collection":
    #if str( $fastq_input.iset_stats ):                                                   ## check that insert statistics is used
        -I "${fastq_input.iset_stats}"
    #end if
    "${reference_fasta_filename}"
    "${fastq_input.fastq_input1.forward}" "${fastq_input.fastq_input1.reverse}"
#else:
    "${reference_fasta_filename}"
    "${fastq_input.fastq_input1}"
#end if

## SAM from bwa mem is converted to BAM, then sorted; the sort prefix "output"
## matches the from_work_dir="output.bam" declared in the outputs section.
| samtools view -Sb - > temporary_bam_file.bam &&
samtools sort -@ \${GALAXY_SLOTS:-1} temporary_bam_file.bam output
]]></command>
<inputs>
<conditional name="reference_source">
    <!-- Where the reference genome comes from: a pre-built index listed in
         bwa_mem_indexes.loc ("cached") or a FASTA dataset from the user's
         history ("history"), which the command section links and indexes. -->
    <param name="reference_source_selector" type="select" label="Load reference genome from">
        <option value="cached">Local cache</option>
        <option value="history">History</option>
    </param>
    <when value="cached">
        <param name="ref_file" type="select" label="Using reference genome" help="Select genome from the list">
            <!-- options from_data_table="bwa_mem_indexes">
            <filter type="sort_by" column="2" />
            <validator type="no_options" message="No indexes are available" />
            </options>
            <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/ -->
            <!-- The "path" column is what the command section reads via ref_file.fields.path. -->
            <options from_file="bwa_mem_indexes.loc">
                <column name="value" index="0"/>
                <column name="dbkey" index="1"/>
                <column name="name" index="2"/>
                <column name="path" index="3"/>
            </options>
        </param>
    </when>
    <when value="history">
        <param name="ref_file" type="data" format="fasta" label="Use the following dataset as the reference sequence" help="You can upload a FASTA sequence to the history and use it as reference" />
    </when>
</conditional>
<conditional name="fastq_input">
    <!-- Selects the layout of the read data. Every paired layout also offers
         the optional insert-size statistics passed to bwa mem as -I.
         Double quotes inside the help attributes are written as &quot; so the
         attribute values stay well-formed XML. The sanitizer keeps digits,
         commas and the decimal point, because -I accepts FLOAT values. -->
    <param name="fastq_input_selector" type="select" label="Single or Paired-end reads" help="Select between paired and single end data">
        <option value="paired">Paired</option>
        <option value="single">Single</option>
        <option value="paired_collection">Paired Collection</option>
        <option value="paired_iv">Paired Interleaved</option>
    </param>
    <when value="paired">
        <param name="fastq_input1" type="data" format="fastq" label="Select first set of reads" help="Specify dataset with forward reads"/>
        <param name="fastq_input2" type="data" format="fastq" label="Select second set of reads" help="Specify dataset with reverse reads"/>
        <param name="iset_stats" type="text" optional="True" size="10" label="Enter mean, standard deviation, max, and min for insert lengths." help="-I; This parameter is only used for paired reads. Only mean is required while sd, max, and min will be inferred. Examples: both &quot;250&quot; and &quot;250,25&quot; will work while &quot;250,,10&quot; will not. See below for details.">
            <sanitizer invalid_char="">
                <valid initial="string.digits"><add value=","/><add value="."/> </valid>
            </sanitizer>
        </param>
    </when>
    <when value="single">
        <param name="fastq_input1" type="data" format="fastq" label="Select fastq dataset" help="Specify dataset with single reads"/>
    </when>
    <when value="paired_collection">
        <param name="fastq_input1" format="fastq" type="data_collection" collection_type="paired" label="Select a paired collection" help="See help section for an explanation of dataset collections"/>
        <param name="iset_stats" type="text" optional="True" size="10" label="Enter mean, standard deviation, max, and min for insert lengths." help="-I; This parameter is only used for paired reads. Only mean is required while sd, max, and min will be inferred. Examples: both &quot;250&quot; and &quot;250,25&quot; will work while &quot;250,,10&quot; will not. See below for details.">
            <sanitizer invalid_char="">
                <valid initial="string.digits"><add value=","/><add value="."/> </valid>
            </sanitizer>
        </param>
    </when>
    <when value="paired_iv">
        <param name="fastq_input1" type="data" format="fastq" label="Select fastq dataset" help="Specify dataset with interleaved reads"/>
        <param name="iset_stats" type="text" optional="True" size="10" label="Enter mean, standard deviation, max, and min for insert lengths." help="-I; This parameter is only used for paired reads. Only mean is required while sd, max, and min will be inferred. Examples: both &quot;250&quot; and &quot;250,25&quot; will work while &quot;250,,10&quot; will not. See below for details.">
            <sanitizer invalid_char="">
                <valid initial="string.digits"><add value=","/><add value="."/> </valid>
            </sanitizer>
        </param>
    </when>
</conditional>
<conditional name="rg">
    <!-- Optional read group: when "set", the command section emits
         -R "@RG\tID:<ID>\tSM:<SM>" built from the two text fields below. -->
    <param name="rg_selector" type="select" label="Set read groups information?" help="Specifying readgroup information can greatly simplify your downstream analyses by allowing combining multiple datasets. See help below for more details">
        <option value="set">Set</option>
        <option value="do_not_set" selected="True">Do not set</option>
    </param>
    <when value="set">
        <param name="ID" type="text" value="readgroup1" size="20" label="Specify readgroup ID" help="This value must be unique among multiple samples in your experiment">
        </param>
        <param name="SM" type="text" value="blood" size="20" label="Specify readgroup sample name (SM)" help="This value should be descriptive">
        </param>
    </when>
    <when value="do_not_set">
        <!-- do nothing -->
    </when>
</conditional>
<conditional name="analysis_type">
    <!-- Analysis mode. "illumina" adds no extra flags, "pacbio" is handled
         entirely in the command section, and "full" exposes three optional
         groups (algorithmic, scoring, input/output), each of which is only
         passed to bwa mem when its own selector is switched to "set". -->
    <param name="analysis_type_selector" type="select" label="Select analysis mode">
        <option value="illumina">1.Simple Illumina mode</option>
        <option value="pacbio">2.PacBio mode</option>
        <option value="full">3.Full list of options</option>
    </param>
    <when value="illumina">
        <!-- do nothing -->
    </when>
    <when value="pacbio">
        <!-- do nothing. all magic happens within <command> tag -->
    </when>
    <when value="full">
        <conditional name="algorithmic_options">
            <param name="algorithmic_options_selector" type="select" label="Set algorithmic options?" help="Sets -k, -w, -d, -r, -y, -c, -D, -W, -m, -S, -P, and -e options.">
                <option value="set">Set</option>
                <option value="do_not_set" selected="True">Do not set</option>
            </param>
            <when value="set">
                <param name="k" type="integer" value="19" label="minimum seed length" help="-k; default=19"/>
                <param name="w" type="integer" value="100" label="band width for banded alignment" help="-w; default=100"/>
                <param name="d" type="integer" value="100" label="off-diagonal X-dropoff" help="-d; default=100"/>
                <param name="r" type="float" value="1.5" label="look for internal seeds inside a seed longer than -k * THIS VALUE" help="-r; default=1.5"/>
                <param name="y" type="integer" value="0" label="find maximum exact matches (MEMs) longer than -k * -r with size less than THIS VALUE" help="-y; default=0"/>
                <param name="c" type="integer" value="500" label="skip seeds with more than that many occurrences" help="-c; default=500"/>
                <param name="D" type="float" value="0.5" label="drop chains shorter than this fraction of the longest overlapping chain" help="-D; default=0.5"/>
                <param name="W" type="integer" value="0" label="discard a chain if seeded bases shorter than" help="-W; default=0"/>
                <param name="m" type="integer" value="50" label="perform at most this many rounds of mate rescues for each read" help="-m; default=50"/>
                <param name="S" type="boolean" truevalue="-S" falsevalue="" label="skip mate rescue" help="-S"/>
                <param name="P" type="boolean" truevalue="-P" falsevalue="" label="skip pairing; mate rescue performed unless -S also in use" help="-P"/>
                <param name="e" type="boolean" truevalue="-e" falsevalue="" label="discard full-length exact matches" help="-e"/>
            </when>
            <when value="do_not_set">
                <!-- do nothing -->
            </when>
        </conditional>
        <conditional name="scoring_options">
            <param name="scoring_options_selector" type="select" label="Set scoring options?" help="Sets -A, -B, -O, -E, -L, and -U options.">
                <option value="set">Set</option>
                <option value="do_not_set" selected="True">Do not set</option>
            </param>
            <when value="set">
                <param name="A" type="integer" value="1" label="score for a sequence match" help="-A; scales options -T, -d, -B, -O, -E, -L, and -U; default=1"/>
                <param name="B" type="integer" value="4" label="penalty for mismatch" help="-B; default=4"/>
                <!-- -O, -E and -L accept "INT" or "INT,INT"; the sanitizers keep only digits and commas. -->
                <param name="O" type="text" value="6,6" label="gap open penalty for deletions and insertions" help="-O; default=6,6">
                    <sanitizer invalid_char="">
                        <valid initial="string.digits"><add value=","/> </valid>
                    </sanitizer>
                </param>
                <param name="E" type="text" value="1,1" label="gap extension penalty; a gap of size k cost '-O + -E*k' " help="-E; default=1,1">
                    <sanitizer invalid_char="">
                        <valid initial="string.digits"><add value=","/> </valid>
                    </sanitizer>
                </param>
                <param name="L" type="text" value="5,5" label="penalty for 5'-end and 3'-end clipping" help="-L; default=5,5">
                    <sanitizer invalid_char="">
                        <valid initial="string.digits"><add value=","/> </valid>
                    </sanitizer>
                </param>
                <param name="U" type="integer" value="17" label="penalty for an unpaired read pair" help="-U; default=17"/>
            </when>
            <when value="do_not_set">
                <!-- do nothing -->
            </when>
        </conditional>
        <conditional name="io_options">
            <param name="io_options_selector" type="select" label="Set input/output options" help="Sets -T, -h, -a, -C, -V, -Y, and -M options.">
                <option value="set">Set</option>
                <option value="do_not_set" selected="True">Do not set</option>
            </param>
            <when value="set">
                <param name="T" type="integer" value="30" label="minimum score to output" help="-T; default=30"/>
                <param name="h" type="integer" value="5" label="if there are this many hits with score >80% of the max score, output all in XA tag" help="-h; default=5"/>
                <param name="a" type="boolean" truevalue="-a" falsevalue="" label="output all alignments for single-ends or unpaired paired-ends" help="-a"/>
                <param name="C" type="boolean" truevalue="-C" falsevalue="" label="append FASTA/FASTQ comment to BAM output" help="-C"/>
                <!-- help corrected: this switch emits -V, not -C -->
                <param name="V" type="boolean" truevalue="-V" falsevalue="" label="output the reference FASTA header in the XR tag" help="-V"/>
                <param name="Y" type="boolean" truevalue="-Y" falsevalue="" label="use soft clipping for supplementary alignments" help="-Y"/>
                <param name="M" type="boolean" truevalue="-M" falsevalue="" label="mark shorter split hits as secondary" help="-M"/>
            </when>
            <when value="do_not_set">
                <!-- do nothing -->
            </when>
        </conditional>
    </when>
</conditional>
</inputs>
<outputs>
    <!-- Single BAM output, collected from "output.bam" in the job working
         directory; the command section's final samtools sort is given the
         prefix "output". -->
    <data name="bam_output" format="bam" from_work_dir="output.bam" label="${tool.name} on ${on_string} (mapped reads in BAM format)"/>
</outputs>
<tests>
    <!-- Paired-end run in simple Illumina mode against a reference FASTA taken
         from the history; the resulting BAM may differ from the expected file
         by at most two lines. -->
    <test>
        <param name="reference_source_selector" value="history" />
        <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
        <param name="fastq_input_selector" value="paired"/>
        <param name="fastq_input1" ftype="fastq" value="bwa-mem-fastq1.fq"/>
        <param name="fastq_input2" ftype="fastq" value="bwa-mem-fastq2.fq"/>
        <param name="analysis_type_selector" value="illumina"/>
        <output name="bam_output" ftype="bam" file="bwa-mem-test1.bam" lines_diff="2" />
    </test>
</tests>
<stdio>
    <!-- Matches every exit code of 1 or greater (open-ended range "1:"). -->
    <exit_code range="1:" />
</stdio>
<help><![CDATA[
**What it does**

From http://arxiv.org/abs/1303.3997:

BWA-MEM is a new alignment algorithm for aligning sequence reads or long query sequences against a large reference genome such as human.
It automatically chooses between local and end-to-end alignments, supports paired-end reads and performs chimeric alignment.
The algorithm is robust to sequencing errors and applicable to a wide range of sequence lengths from 70bp to a few megabases.
For mapping 100bp sequences, BWA-MEM shows better performance than several state-of-art read aligners to date.

It is best suited for mapping long (>70 nt) reads against large reference genomes.

This Galaxy tool wraps bwa-mem module of bwa read mapping tool. Galaxy implementation takes fastq files as input and produces output in BAM (not SAM) format, which can be further processed using various BAM utilities existing in Galaxy (BAMTools, SAMTools, Picard).

-----

**Galaxy-specific option**

Galaxy allows four levels of control over bwa-mem options provided by **Select analysis mode** menu option. These are:

  1. *Simple Illumina mode*: The simplest possible bwa mem application in which it aligns single or paired-end data to reference using default parameters. It is equivalent to the following command: bwa mem <reference index> <fastq dataset1> [fastq dataset2]
  2. *PacBio mode*: The mode adjusted specifically for mapping of long PacBio subreads. Equivalent to the following command: bwa mem -k17 -W40 -r10 -A1 -B1 -O1 -E1 -L0 <reference index> <PacBio dataset in fastq format>
  3. *Full list of options*: Allows access to all options through Galaxy interface.

------

**BWA MEM options**

Each Galaxy parameter widget corresponds to command line flags listed below:

Algorithm options::

       -k INT        minimum seed length [19]
       -w INT        band width for banded alignment [100]
       -d INT        off-diagonal X-dropoff [100]
       -r FLOAT      look for internal seeds inside a seed longer than {-k} * FLOAT [1.5]
       -y INT        find MEMs longer than {-k} * {-r} with size less than INT [0]
       -c INT        skip seeds with more than INT occurrences [500]
       -D FLOAT      drop chains shorter than FLOAT fraction of the longest overlapping chain [0.50]
       -W INT        discard a chain if seeded bases shorter than INT [0]
       -m INT        perform at most INT rounds of mate rescues for each read [50]
       -S            skip mate rescue
       -P            skip pairing; mate rescue performed unless -S also in use
       -e            discard full-length exact matches

Scoring options::

       -A INT        score for a sequence match, which scales options -TdBOELU unless overridden [1]
       -B INT        penalty for a mismatch [4]
       -O INT[,INT]  gap open penalties for deletions and insertions [6,6]
       -E INT[,INT]  gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [1,1]
       -L INT[,INT]  penalty for 5'- and 3'-end clipping [5,5]
       -U INT        penalty for an unpaired read pair [17]

Input/output options::

       -p            first query file consists of interleaved paired-end sequences
       -R STR        read group header line such as '@RG\tID:foo\tSM:bar' [null]
       -v INT        verbose level: 1=error, 2=warning, 3=message, 4+=debugging [3]
       -T INT        minimum score to output [30]
       -h INT        if there are this many hits with score >80% of the max score, output all in XA tag [5]
       -a            output all alignments for SE or unpaired PE
       -C            append FASTA/FASTQ comment to SAM output
       -V            output the reference FASTA header in the XR tag
       -Y            use soft clipping for supplementary alignments
       -M            mark shorter split hits as secondary
       -I FLOAT[,FLOAT[,INT[,INT]]]
                     specify the mean, standard deviation (10% of the mean if absent), max
                     (4 sigma from the mean if absent) and min of the insert size distribution.
                     FR orientation only. [inferred]
]]></help>
<citations>
    <!-- Two DOI-based references followed by a BibTeX entry for the
         BWA-MEM arXiv preprint (1303.3997). -->
    <citation type="doi">10.1093/bioinformatics/btp324</citation>
    <citation type="doi">10.1093/bioinformatics/btp698</citation>
    <citation type="bibtex">@misc{1303.3997,
Author = {Heng Li},
Title = {Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM},
Year = {2013},
Eprint = {arXiv:1303.3997},
url = {http://arxiv.org/abs/1303.3997},
}</citation>
</citations>
</tool>