From 766ced837e1ad1f01837b93927a393292f6a8b5e Mon Sep 17 00:00:00 2001 From: Frank Austin Nothaft Date: Sat, 5 Aug 2017 14:15:22 -0700 Subject: [PATCH] Added alternate sort and duplicate marking tools (resolves #95) Adds the following sorters and duplicate markers for benchmarking purposes: * Sorters: * Picard SortSam * sambamba sort * Duplicate markers: * sambamba mkdup * SAMBLASTER * samtools rmdup --- src/toil_lib/tools/preprocessing.py | 171 ++++++++++++++++++++++++++++ 1 file changed, 171 insertions(+) diff --git a/src/toil_lib/tools/preprocessing.py b/src/toil_lib/tools/preprocessing.py index 3692ad0..36494c6 100644 --- a/src/toil_lib/tools/preprocessing.py +++ b/src/toil_lib/tools/preprocessing.py @@ -85,6 +85,29 @@ def run_samtools_index(job, bam): return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'sample.bam.bai')) + +def run_samtools_view(job, bam): + """ + Runs SAMtools view to create a SAM file. + + :param JobFunctionWrappingJob job: passed automatically by Toil + :param str bam: FileStoreID of the BAM file + :return: FileStoreID for SAM file + :rtype: str + """ + work_dir = job.fileStore.getLocalTempDir() + job.fileStore.readGlobalFile(bam, os.path.join(work_dir, 'sample.bam')) + # Call: index the bam + parameters = ['view', + '-h', + '-o', '/data/sample.sam', + '/data/sample.bam'] + dockerCall(job=job, workDir=work_dir, parameters=parameters, + tool='quay.io/ucsc_cgl/samtools:0.1.19--dd5ac549b95eb3e5d166a5e310417ef13651994e') + # Write to fileStore + return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'sample.sam')) + + def _log_runtime(job, start, end, cmd): elapsed_time = end - start @@ -121,6 +144,111 @@ def run_samtools_sort(job, bam): return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'output.bam')) +def run_samtools_rmdup(job, bam): + """ + Mark reads as PCR duplicates using SAMtools rmdup + + :param JobFunctionWrappingJob job: passed automatically by Toil + :param str bam: FileStoreID for BAM file + :return: FileStoreID for sorted BAM file + :rtype: str + """ + work_dir = job.fileStore.getLocalTempDir() + job.fileStore.readGlobalFile(bam, os.path.join(work_dir, 'input.bam')) + command = ['rmdup', + '/data/input.bam', + '/data/output.bam'] + + start_time = time.time() + dockerCall(job=job, workDir=work_dir, + parameters=command, + tool='quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c') + end_time = time.time() + _log_runtime(job, start_time, end_time, "samtools rmdup") + return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'output.bam')) + + +def run_sambamba_sort(job, bam, sort_by_name=False): + """ + Sorts BAM file using Sambamba sort + + :param JobFunctionWrappingJob job: passed automatically by Toil + :param str bam: FileStoreID for BAM file + :param boolean sort_by_name: If true, sorts by read name instead of coordinate. + :return: FileStoreID for sorted BAM file + :rtype: str + """ + work_dir = job.fileStore.getLocalTempDir() + job.fileStore.readGlobalFile(bam, os.path.join(work_dir, 'input.bam')) + command = ['/usr/local/bin/sambamba', + 'sort', + '-t', str(int(job.cores)), + '-m', str(job.memory), + '-o', '/data/output.bam', + '/data/input.bam'] + + if sort_by_name: + command.append('-n') + + start_time = time.time() + dockerCall(job=job, workDir=work_dir, + parameters=command, + tool='quay.io/biocontainers/sambamba:0.6.6--0') + end_time = time.time() + _log_runtime(job, start_time, end_time, "sambamba sort") + return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'output.bam')) + + +def run_sambamba_markdup(job, bam): + """ + Marks reads as PCR duplicates using Sambamba + + :param JobFunctionWrappingJob job: passed automatically by Toil + :param str bam: FileStoreID for BAM file + :return: FileStoreID for sorted BAM file + :rtype: str + """ + work_dir = job.fileStore.getLocalTempDir() + job.fileStore.readGlobalFile(bam, os.path.join(work_dir, 'input.bam')) + command = ['/usr/local/bin/sambamba', + 'markdup', + '-t', str(int(job.cores)), + '/data/input.bam', + '/data/output.bam'] + + start_time = time.time() + dockerCall(job=job, workDir=work_dir, + parameters=command, + tool='quay.io/biocontainers/sambamba:0.6.6--0') + end_time = time.time() + _log_runtime(job, start_time, end_time, "sambamba mkdup") + return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'output.bam')) + + +def run_samblaster(job, bam): + """ + Marks reads as PCR duplicates using SAMBLASTER + + :param JobFunctionWrappingJob job: passed automatically by Toil + :param str bam: FileStoreID for BAM file + :return: FileStoreID for sorted BAM file + :rtype: str + """ + work_dir = job.fileStore.getLocalTempDir() + job.fileStore.readGlobalFile(bam, os.path.join(work_dir, 'input.bam')) + command = ['/usr/local/bin/samblaster', + '-s', '/data/input.bam', + '-o', '/data/output.bam'] + + start_time = time.time() + dockerCall(job=job, workDir=work_dir, + parameters=command, + tool='quay.io/biocontainers/samblaster:0.1.24--0') + end_time = time.time() + _log_runtime(job, start_time, end_time, "SAMBLASTER") + return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'output.bam')) + + def run_picard_create_sequence_dictionary(job, ref_id): """ Uses Picard to create reference sequence dictionary @@ -186,6 +314,44 @@ def picard_mark_duplicates(job, bam, bai, validation_stringency='LENIENT'): return bam, bai +def run_picard_sort(job, bam, sort_by_name=False): + """ + Sorts BAM file using Picard SortSam + + :param JobFunctionWrappingJob job: passed automatically by Toil + :param str bam: FileStoreID for BAM file + :param boolean sort_by_name: If true, sorts by read name instead of coordinate. + :return: FileStoreID for sorted BAM file + :rtype: str + """ + work_dir = job.fileStore.getLocalTempDir() + job.fileStore.readGlobalFile(bam, os.path.join(work_dir, 'input.bam')) + command = ['SortSam', + 'O=/data/output.bam', + 'I=/data/input.bam'] + + # picard-tools container doesn't have JAVA_OPTS variable + # Set TMPDIR to /data to prevent writing temporary files to /tmp + docker_parameters = ['--rm', + '--log-driver', 'none', + '-e', 'JAVA_OPTIONS=-Djava.io.tmpdir=/data/ -Xmx{}'.format(job.memory), + '-v', '{}:/data'.format(work_dir)] + + if sort_by_name: + command.append('SO=queryname') + else: + command.append('SO=coordinate') + + start_time = time.time() + dockerCall(job=job, workDir=work_dir, + parameters=command, + tool='quay.io/ucsc_cgl/picardtools:1.95--dd5ac549b95eb3e5d166a5e310417ef13651994e', + dockerParameters=docker_parameters) + end_time = time.time() + _log_runtime(job, start_time, end_time, "Picard SortSam") + return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'output.bam')) + + def run_gatk_preprocessing(job, bam, bai, ref, ref_dict, fai, g1k, mills, dbsnp, realign=False, unsafe=False): """ GATK Preprocessing Pipeline @@ -365,6 +531,9 @@ def run_realigner_target_creator(job, bam, bai, ref, ref_dict, fai, g1k, mills, '--downsampling_type', 'NONE', '-o', '/data/sample.intervals'] + end_time = time.time() + _log_runtime(job, start_time, end_time, "GATK3 RealignerTargetCreator") + if unsafe: parameters.extend(['-U', 'ALLOW_SEQ_DICT_INCOMPATIBILITY']) @@ -542,6 +711,8 @@ def apply_bqsr_recalibration(job, table, bam, bai, ref, ref_dict, fai, unsafe=Fa '-I', '/data/input.bam', '-BQSR', '/data/recal.table', '-o', '/data/bqsr.bam'] + end_time = time.time() + _log_runtime(job, start_time, end_time, "GATK3 BQSR PrintReads") if unsafe: parameters.extend(['-U', 'ALLOW_SEQ_DICT_INCOMPATIBILITY'])