Skip to content

Commit

Permalink
feat: Fixing Mantis (#427)
Browse files Browse the repository at this point in the history
Co-authored-by: giacuong171 <giacuong171@gmail.com>
Co-authored-by: Eric Blanc <eric.blanc@bihealth.de>
Co-authored-by: ericblanc20 <eric.blanc@bih-charite.de>
  • Loading branch information
4 people authored Apr 17, 2024
1 parent 8fb40e5 commit e008e7e
Show file tree
Hide file tree
Showing 5 changed files with 210 additions and 110 deletions.
22 changes: 11 additions & 11 deletions snappy_pipeline/workflows/somatic_msi_calling/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -51,21 +51,21 @@ rule somatic_msi_calling_link_out_run:

# Somatic Microsatellite Instability analysis~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Run MANTIS ------------------------------------------------------------------
# Run MANTIS_msi2 ------------------------------------------------------------------


rule somatic_msi_calling_mantis:
rule somatic_msi_calling_mantis_msi2:
input:
unpack(wf.get_input_files("mantis", "run")),
unpack(wf.get_input_files("mantis_msi2", "run")),
output:
**wf.get_output_files("mantis", "run"),
threads: wf.get_resource("mantis", "run", "threads")
**wf.get_output_files("mantis_msi2", "run"),
threads: wf.get_resource("mantis_msi2", "run", "threads")
resources:
time=wf.get_resource("mantis", "run", "time"),
memory=wf.get_resource("mantis", "run", "memory"),
partition=wf.get_resource("mantis", "run", "partition"),
tmpdir=wf.get_resource("mantis", "run", "tmpdir"),
time=wf.get_resource("mantis_msi2", "run", "time"),
memory=wf.get_resource("mantis_msi2", "run", "memory"),
partition=wf.get_resource("mantis_msi2", "run", "partition"),
tmpdir=wf.get_resource("mantis_msi2", "run", "tmpdir"),
log:
wf.get_log_file("mantis", "run"),
**wf.get_log_file("mantis_msi2", "run"),
wrapper:
wf.wrapper_path("mantis/run")
wf.wrapper_path("mantis/mantis_msi2/run")
167 changes: 112 additions & 55 deletions snappy_pipeline/workflows/somatic_msi_calling/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@
whole genomes, exomes or large panels). MANTIS starts from the aligned reads
(thus off ``ngs_mapping``) and generates a result file per tumor/normal pair.
As MANTIS is not maintained anymore, the pipeline now supports only
`MANTIS2 <https://github.com/nh13/MANTIS2>`_.
The new version appears to be very silimar to the old one, both in terms of input & output files,
and in terms of requirements.
==========
Step Input
==========
Expand All @@ -19,13 +24,13 @@
.. note:: Tool-Specific Output
As the only integrated tool is MANTIS at the moment, the output is very tailored to the result
As the only integrated tool is MANTIS2 at the moment, the output is very tailored to the result
of this tool. In the future, this section might contain "common" output and tool-specific
output sub sections.
- ``mantis.{mapper}.{lib_name}-{lib_pk}/out/``
- ``mantis.{mapper}.{lib_name}-{lib_pk}.results.txt``
- ``mantis.{mapper}.{lib_name}-{lib_pk}.results.txt.status``
- ``{mapper}.mantis_msi2.{lib_name}-{lib_pk}/out/``
- ``{mapper}.mantis_msi2.{lib_name}-{lib_pk}.results.txt``
- ``{mapper}.mantis_msi2.{lib_name}-{lib_pk}.results.txt.status``
=====================
Default Configuration
Expand All @@ -39,14 +44,15 @@
Available Somatic Targeted CNV Caller
=====================================
- ``mantis``
- ``mantis_msi2``
"""

from collections import OrderedDict
import os
import sys

from biomedsheets.shortcuts import CancerCaseSheet, is_not_background
from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background
from snakemake.io import expand

from snappy_pipeline.utils import dictify, listify
Expand All @@ -60,30 +66,50 @@

__author__ = "Clemens Messerschmidt <clemens.messerschmidt@bih-charite.de>"

#: Extensions of files to create as main payload
EXT_VALUES = (".results.txt", ".results.txt.status", ".results.txt.md5", ".results.txt.status.md5")

#: Names of the files to create for the extension
EXT_NAMES = ("result", "status", "result_md5", "status_md5")

EXT_MATCHED = {
"mantis_msi2": {
"result": ".results.txt",
"status": ".results.txt.status",
"result_md5": ".results.txt.md5",
"status_md5": ".results.txt.status.md5",
},
}

#: Available somatic variant callers assuming matched samples.
MSI_CALLERS_MATCHED = ("mantis_msi2",)


#: Default configuration for the somatic_msi_calling step
DEFAULT_CONFIG = r"""
# Default configuration somatic_msi_calling
step_config:
somatic_msi_calling:
tools: ['mantis'] # REQUIRED - available: 'mantis'
tools: ['mantis_msi2'] # REQUIRED - available: 'mantis_msi2'
path_ngs_mapping: ../ngs_mapping # REQUIRED
loci_bed: /fast/groups/cubi/projects/biotools/Mantis/appData/hg19/loci.bed # REQUIRED
loci_bed: "" # REQUIRED
# hg19: /fast/work/groups/cubi/projects/biotools/Mantis/appData/hg19/loci.bed
# hg38: /fast/work/groups/cubi/projects/biotools/Mantis/appData/hg38/GRCh38.d1.vd1.all_loci.bed
"""


class MantisStepPart(BaseStepPart):
"""Perform somatic microsatellite instability with MANTIS"""
class Mantis2StepPart(BaseStepPart):
"""Perform somatic microsatellite instability with MANTIS_msi2"""

#: Step name
name = "mantis"
name = "mantis_msi2"

#: Class available actions
actions = ("run",)

def __init__(self, parent):
super().__init__(parent)
self.base_path_out = (
"work/mantis.{{mapper}}.{{library_name}}/out/" "mantis.{{mapper}}.{{library_name}}{ext}"
"work/{{mapper}}.{msi_caller}.{{tumor_library}}/out/"
"{{mapper}}.{msi_caller}.{{tumor_library}}{ext}"
)
# Build shortcut from cancer bio sample name to matched cancer sample
self.tumor_ngs_library_to_sample_pair = OrderedDict()
Expand All @@ -92,12 +118,10 @@ def __init__(self, parent):
sheet.all_sample_pairs_by_tumor_dna_ngs_library
)

def get_normal_lib_name(self, wildcards):
"""Return name of normal (non-cancer) library"""
pair = self.tumor_ngs_library_to_sample_pair[wildcards.library_name]
return pair.normal_sample.dna_ngs_library.name

def get_input_files(self, action):
# Validate action
self._validate_action(action)

def input_function(wildcards):
"""Helper wrapper function"""
# Get shorcut to Snakemake sub workflow
Expand All @@ -110,7 +134,7 @@ def input_function(wildcards):
)
)
tumor_base_path = (
"output/{mapper}.{library_name}/out/" "{mapper}.{library_name}"
"output/{mapper}.{tumor_library}/out/" "{mapper}.{tumor_library}"
).format(**wildcards)
return {
"normal_bam": ngs_mapping(normal_base_path + ".bam"),
Expand All @@ -119,24 +143,38 @@ def input_function(wildcards):
"tumor_bai": ngs_mapping(tumor_base_path + ".bam.bai"),
}

assert action == "run", "Unsupported actions"
return input_function

@dictify
def get_normal_lib_name(self, wildcards):
"""Return name of normal (non-cancer) library"""
pair = self.tumor_ngs_library_to_sample_pair[wildcards.tumor_library]
return pair.normal_sample.dna_ngs_library.name

def get_output_files(self, action):
assert action == "run", "Unsupported actions"
exts = {"result": "results.txt", "status": "results.txt.status"}
for key, ext in exts.items():
yield key, (
"work/mantis.{{mapper}}.{{library_name}}/out/"
"mantis.{{mapper}}.{{library_name}}_{sfx}"
).format(sfx=ext)

@staticmethod
def get_log_file(action):
"""Return path to log file for the given action"""
_ = action
return "work/mantis.{mapper}.{library_name}/log/snakemake.mantis_run.log"
# Validate action
self._validate_action(action)
return dict(
zip(EXT_NAMES, expand(self.base_path_out, msi_caller=[self.name], ext=EXT_VALUES))
)

@dictify
def _get_log_file(self, action):
"""Return dict of log files."""
# Validate action
self._validate_action(action)

prefix = (
"work/{{mapper}}.{msi_caller}.{{tumor_library}}/log/"
"{{mapper}}.{msi_caller}.{{tumor_library}}"
).format(msi_caller=self.__class__.name)
key_ext = (
("log", ".log"),
("conda_info", ".conda_info.txt"),
("conda_list", ".conda_list.txt"),
)
for key, ext in key_ext:
yield key, prefix + ext
yield key + "_md5", prefix + ext + ".md5"

def get_resource_usage(self, action):
"""Get Resource Usage
Expand All @@ -150,7 +188,7 @@ def get_resource_usage(self, action):
self._validate_action(action)
return ResourceUsage(
threads=3,
time="08:00:00", # 2 hours
time="08:00:00", # 8 hours
memory=f"{30 * 1024 * 3}M",
)

Expand All @@ -164,9 +202,13 @@ class SomaticMsiCallingWorkflow(BaseStep):
#: Default biomed sheet class
sheet_shortcut_class = CancerCaseSheet

sheet_shortcut_kwargs = {
"options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=True)
}

@classmethod
def default_config_yaml(cls):
"""Return default config YAML, to be overwritten by project-specific one"""
"""Return default config YAML, to be overwritten by project-specific one."""
return DEFAULT_CONFIG

def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir):
Expand All @@ -179,14 +221,41 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
(NgsMappingWorkflow,),
)
# Register sub step classes so the sub steps are available
self.register_sub_step_classes((MantisStepPart, LinkOutStepPart))
self.register_sub_step_classes((Mantis2StepPart, LinkOutStepPart))
# Initialize sub-workflows
self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"])

@listify
def get_result_files(self):
"""Return list of result files for the somatic targeted sequencing CNV calling step"""
tool_actions = {"mantis": ("run",)}
"""Return list of result files for the MSI calling workflow"""
name_pattern = "{mapper}.{msi_caller}.{tumor_library.name}"
for msi_caller in set(self.config["tools"]) & set(MSI_CALLERS_MATCHED):
yield from self._yield_result_files_matched(
os.path.join("output", name_pattern, "out", name_pattern + "{ext}"),
mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"],
msi_caller=msi_caller,
ext=EXT_MATCHED[msi_caller].values() if msi_caller in EXT_MATCHED else EXT_VALUES,
)
yield from self._yield_result_files_matched(
os.path.join("output", name_pattern, "log", name_pattern + "{ext}"),
mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"],
msi_caller=msi_caller,
ext=(
".log",
".log.md5",
".conda_info.txt",
".conda_info.txt.md5",
".conda_list.txt",
".conda_list.txt.md5",
),
)

def _yield_result_files_matched(self, tpl, **kwargs):
"""Build output paths from path template and extension list.
This function returns the results from the matched msi callers such as
mantis.
"""
for sheet in filter(is_not_background, self.shortcut_sheets):
for sample_pair in sheet.all_sample_pairs:
if (
Expand All @@ -199,21 +268,9 @@ def get_result_files(self):
)
print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr)
continue
for tool in self.config["tools"]:
for action in tool_actions[tool]:
try:
tpls = self.sub_steps[tool].get_output_files(action).values()
except AttributeError:
tpls = self.sub_steps[tool].get_output_files(action)
for tpl in tpls:
filenames = expand(
tpl,
mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"],
library_name=[sample_pair.tumor_sample.dna_ngs_library.name],
)
for f in filenames:
if ".tmp." not in f:
yield f.replace("work/", "output/")
yield from expand(
tpl, tumor_library=[sample_pair.tumor_sample.dna_ngs_library], **kwargs
)

def check_config(self):
"""Check that the necessary globalc onfiguration is present"""
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
channels:
- conda-forge
- anaconda
- bioconda
dependencies:
- numpy ==1.6.2
- pysam ==0.8.3
- mantis-msi2
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,21 @@
set -x
# Also pipe everything to log file
if [[ -n "{snakemake.log}" ]]; then
if [[ -n "{snakemake.log.log}" ]]; then
if [[ "$(set +e; tty; set -e)" != "" ]]; then
rm -f "{snakemake.log}" && mkdir -p $(dirname {snakemake.log})
exec &> >(tee -a "{snakemake.log}" >&2)
rm -f "{snakemake.log}" && mkdir -p $(dirname {snakemake.log.log})
exec &> >(tee -a "{snakemake.log.log}" >&2)
else
rm -f "{snakemake.log}" && mkdir -p $(dirname {snakemake.log})
echo "No tty, logging disabled" >"{snakemake.log}"
rm -f "{snakemake.log}" && mkdir -p $(dirname {snakemake.log.log})
echo "No tty, logging disabled" >"{snakemake.log.log}"
fi
fi
conda list >{snakemake.log.conda_list}
conda info >{snakemake.log.conda_info}
md5sum {snakemake.log.conda_list} | sed -re "s/ (\.?.+\/)([^\/]+)$/ \2/" > {snakemake.log.conda_list}.md5
md5sum {snakemake.log.conda_info} | sed -re "s/ (\.?.+\/)([^\/]+)$/ \2/" > {snakemake.log.conda_info}.md5
# Setup auto-cleaned TMPDIR
export TMPDIR=$(mktemp -d)
trap "rm -rf $TMPDIR" EXIT
Expand All @@ -32,8 +37,10 @@
# The following config is recommended by the authors for WES,
# but should also work for reasonable deep WGS according to them.
# https://github.com/OSU-SRLab/MANTIS/issues/25
#
# Only one thread is used, see https://github.com/OSU-SRLab/MANTIS/issues/57
python /fast/groups/cubi/projects/biotools/Mantis/MANTIS/mantis.py \
mantis-msi2 \
-t {snakemake.input.tumor_bam} \
-n {snakemake.input.normal_bam} \
--genome {snakemake.config[static_data_config][reference][path]} \
Expand All @@ -43,7 +50,7 @@
--min-locus-quality 25.0 \
--min-locus-coverage 20 \
--min-repeat-reads 1 \
--threads 3 \
--threads 1 \
-o $TMPDIR/out/$(basename {snakemake.output.result})
pushd $TMPDIR/out
Expand All @@ -55,3 +62,11 @@
mv $TMPDIR/out/* $(dirname {snakemake.output.result})
"""
)

# Compute MD5 sums of logs.
shell(
r"""
sleep 1s # try to wait for log file flush
md5sum {snakemake.log.log} >{snakemake.log.log_md5}
"""
)
Loading

0 comments on commit e008e7e

Please sign in to comment.