Added alternate sort and duplicate marking tools (resolves BD2KGenomics#95)

Adds the following sorters and duplicate markers for benchmarking purposes (a usage sketch follows the list):

* Sorters:
  * Picard SortSam
  * sambamba sort
* Duplicate markers:
  * sambamba mkdup
  * SAMBLASTER
  * samtools rmdup
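
A minimal sketch of how these jobs might be wired together in a Toil workflow (illustrative only: the job store path, resource sizes, and input path are placeholders, not part of this change):

    # Illustrative benchmark DAG using the job functions added here.
    from toil.common import Toil
    from toil.job import Job

    from toil_lib.tools.preprocessing import (run_picard_sort,
                                              run_sambamba_markdup,
                                              run_sambamba_sort)

    def benchmark(job, bam_id):
        # Run each sorter as a child job over the same input BAM.
        job.addChildJobFn(run_picard_sort, bam_id,
                          cores=4, memory='8G', disk='20G')
        sambamba = job.addChildJobFn(run_sambamba_sort, bam_id,
                                     cores=4, memory='8G', disk='20G')
        # Mark duplicates on the sambamba-sorted output (rv() resolves to
        # the sorted BAM's FileStoreID).
        sambamba.addChildJobFn(run_sambamba_markdup, sambamba.rv(),
                               cores=4, memory='8G', disk='20G')

    if __name__ == '__main__':
        options = Job.Runner.getDefaultOptions('./jobstore')
        with Toil(options) as toil:
            bam_id = toil.importFile('file:///path/to/sample.bam')
            toil.start(Job.wrapJobFn(benchmark, bam_id))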
fnothaft committed Aug 7, 2017
1 parent 3dfe281 commit 766ced8
Showing 1 changed file with 171 additions and 0 deletions.
src/toil_lib/tools/preprocessing.py
@@ -85,6 +85,29 @@ def run_samtools_index(job, bam):
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'sample.bam.bai'))



def run_samtools_view(job, bam):
    """
    Runs SAMtools view to convert a BAM file to SAM.
    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam: FileStoreID of the BAM file
    :return: FileStoreID for SAM file
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(bam, os.path.join(work_dir, 'sample.bam'))
    # Call: convert the BAM to SAM, keeping the header (-h)
    parameters = ['view',
                  '-h',
                  '-o', '/data/sample.sam',
                  '/data/sample.bam']
    dockerCall(job=job, workDir=work_dir, parameters=parameters,
               tool='quay.io/ucsc_cgl/samtools:0.1.19--dd5ac549b95eb3e5d166a5e310417ef13651994e')
    # Write to fileStore
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'sample.sam'))
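# Illustrative chaining (not part of this module): SAMBLASTER consumes SAM
# grouped by read name, so name-sort and convert before marking duplicates:
#   nsorted = job.addChildJobFn(run_sambamba_sort, bam_id, sort_by_name=True)
#   sam = nsorted.addChildJobFn(run_samtools_view, nsorted.rv())
#   sam.addChildJobFn(run_samblaster, sam.rv())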


def _log_runtime(job, start, end, cmd):
    elapsed_time = end - start
@@ -121,6 +144,111 @@ def run_samtools_sort(job, bam):
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'output.bam'))


def run_samtools_rmdup(job, bam):
    """
    Removes PCR duplicate reads using SAMtools rmdup
    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam: FileStoreID for BAM file
    :return: FileStoreID for BAM file with duplicates removed
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(bam, os.path.join(work_dir, 'input.bam'))
    # Note: rmdup removes duplicates outright (it does not flag them) and
    # expects a coordinate-sorted BAM.
    command = ['rmdup',
               '/data/input.bam',
               '/data/output.bam']

    start_time = time.time()
    dockerCall(job=job, workDir=work_dir,
               parameters=command,
               tool='quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c')
    end_time = time.time()
    _log_runtime(job, start_time, end_time, "samtools rmdup")
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'output.bam'))


def run_sambamba_sort(job, bam, sort_by_name=False):
    """
    Sorts BAM file using Sambamba sort
    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam: FileStoreID for BAM file
    :param bool sort_by_name: If True, sorts by read name instead of coordinate.
    :return: FileStoreID for sorted BAM file
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(bam, os.path.join(work_dir, 'input.bam'))
    # -t: worker threads; -m: sort memory cap, passed through from the
    # Toil job's resource request
    command = ['/usr/local/bin/sambamba',
               'sort',
               '-t', str(int(job.cores)),
               '-m', str(job.memory),
               '-o', '/data/output.bam',
               '/data/input.bam']

    if sort_by_name:
        command.append('-n')

    start_time = time.time()
    dockerCall(job=job, workDir=work_dir,
               parameters=command,
               tool='quay.io/biocontainers/sambamba:0.6.6--0')
    end_time = time.time()
    _log_runtime(job, start_time, end_time, "sambamba sort")
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'output.bam'))
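# Illustrative: pass sort_by_name=True to produce queryname order, which some
# downstream duplicate markers require:
#   job.addChildJobFn(run_sambamba_sort, bam_id, sort_by_name=True)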


def run_sambamba_markdup(job, bam):
    """
    Marks reads as PCR duplicates using Sambamba
    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam: FileStoreID for BAM file
    :return: FileStoreID for BAM file with duplicates marked
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(bam, os.path.join(work_dir, 'input.bam'))
    # markdup sets the duplicate FLAG bit (0x400) rather than removing reads;
    # sambamba's '-r' option would remove them instead.
    command = ['/usr/local/bin/sambamba',
               'markdup',
               '-t', str(int(job.cores)),
               '/data/input.bam',
               '/data/output.bam']

    start_time = time.time()
    dockerCall(job=job, workDir=work_dir,
               parameters=command,
               tool='quay.io/biocontainers/sambamba:0.6.6--0')
    end_time = time.time()
    _log_runtime(job, start_time, end_time, "sambamba mkdup")
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'output.bam'))


def run_samblaster(job, sam):
    """
    Marks reads as PCR duplicates using SAMBLASTER
    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str sam: FileStoreID for a SAM file with a header, grouped by read name
    :return: FileStoreID for SAM file with duplicates marked
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(sam, os.path.join(work_dir, 'input.sam'))
    # SAMBLASTER reads and writes SAM; -i and -o name the input and output files
    command = ['/usr/local/bin/samblaster',
               '-i', '/data/input.sam',
               '-o', '/data/output.sam']

    start_time = time.time()
    dockerCall(job=job, workDir=work_dir,
               parameters=command,
               tool='quay.io/biocontainers/samblaster:0.1.24--0')
    end_time = time.time()
    _log_runtime(job, start_time, end_time, "SAMBLASTER")
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'output.sam'))


def run_picard_create_sequence_dictionary(job, ref_id):
"""
Uses Picard to create reference sequence dictionary
@@ -186,6 +314,44 @@ def picard_mark_duplicates(job, bam, bai, validation_stringency='LENIENT'):
    return bam, bai


def run_picard_sort(job, bam, sort_by_name=False):
    """
    Sorts BAM file using Picard SortSam
    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam: FileStoreID for BAM file
    :param bool sort_by_name: If True, sorts by read name instead of coordinate.
    :return: FileStoreID for sorted BAM file
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(bam, os.path.join(work_dir, 'input.bam'))
    command = ['SortSam',
               'O=/data/output.bam',
               'I=/data/input.bam']

    # The picard-tools container doesn't read a JAVA_OPTS variable, so pass
    # JAVA_OPTIONS directly; java.io.tmpdir points at /data to prevent writing
    # temporary files to the container's /tmp.
    docker_parameters = ['--rm',
                         '--log-driver', 'none',
                         '-e', 'JAVA_OPTIONS=-Djava.io.tmpdir=/data/ -Xmx{}'.format(job.memory),
                         '-v', '{}:/data'.format(work_dir)]

    if sort_by_name:
        command.append('SO=queryname')
    else:
        command.append('SO=coordinate')

    start_time = time.time()
    dockerCall(job=job, workDir=work_dir,
               parameters=command,
               tool='quay.io/ucsc_cgl/picardtools:1.95--dd5ac549b95eb3e5d166a5e310417ef13651994e',
               dockerParameters=docker_parameters)
    end_time = time.time()
    _log_runtime(job, start_time, end_time, "Picard SortSam")
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'output.bam'))
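# Illustrative: the JVM heap above tracks the Toil job's memory request, so
# size it when scheduling, e.g.:
#   job.addChildJobFn(run_picard_sort, bam_id, cores=2, memory='8G', disk='20G')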


def run_gatk_preprocessing(job, bam, bai, ref, ref_dict, fai, g1k, mills, dbsnp, realign=False, unsafe=False):
"""
GATK Preprocessing Pipeline
@@ -365,6 +531,9 @@ def run_realigner_target_creator(job, bam, bai, ref, ref_dict, fai, g1k, mills,
                  '--downsampling_type', 'NONE',
                  '-o', '/data/sample.intervals']

    end_time = time.time()
    _log_runtime(job, start_time, end_time, "GATK3 RealignerTargetCreator")

    if unsafe:
        parameters.extend(['-U', 'ALLOW_SEQ_DICT_INCOMPATIBILITY'])

@@ -542,6 +711,8 @@ def apply_bqsr_recalibration(job, table, bam, bai, ref, ref_dict, fai, unsafe=False):
                  '-I', '/data/input.bam',
                  '-BQSR', '/data/recal.table',
                  '-o', '/data/bqsr.bam']
    end_time = time.time()
    _log_runtime(job, start_time, end_time, "GATK3 BQSR PrintReads")

    if unsafe:
        parameters.extend(['-U', 'ALLOW_SEQ_DICT_INCOMPATIBILITY'])
