Skip to content

Commit

Permalink
Develop -> master (#389)
Browse files Browse the repository at this point in the history
  • Loading branch information
Maarten-vd-Sande authored Jun 11, 2020
1 parent d421ce6 commit cf045b3
Show file tree
Hide file tree
Showing 23 changed files with 249 additions and 194 deletions.
8 changes: 7 additions & 1 deletion bin/seq2science
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ try:
except ImportError:
full_import = False

try:
import mamba
conda_frontend = "mamba"
except ImportError:
conda_frontend = "conda"


__version__ = "0.0.0"

Expand Down Expand Up @@ -181,7 +187,7 @@ def _run(args, base_dir, workflows_dir, config_path):
parsed_args = {"snakefile": os.path.join(workflows_dir, args.workflow, "Snakefile"),
"cores": args.cores,
"use_conda": True,
"conda_frontend": "mamba",
"conda_frontend": conda_frontend,
"conda_prefix": os.path.join(base_dir, ".snakemake"),
"dryrun": args.dryrun}

Expand Down
4 changes: 1 addition & 3 deletions requirements.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,11 @@ dependencies:
- bioconda::sra-tools=2.9.1
- bioconda::entrez-direct=11.0
- bioconda::pysam=0.15.3
- pkgs/main::packaging=19.1
- conda-forge::pygithub=1.43.6
- pkgs/main::conda=4.7.11
- bioconda::norns=0.1.5
- anaconda::biopython=1.74
- pkgs/main::filelock=3.0.12
- pkgs/mean::pyyaml
- pkgs/main::pyyaml=5.3.1
- pkgs/main::beautifulsoup4=4.9.0
- conda-forge::pretty_html_table=0.9.dev0
- bioconda::trackhub=0.1.2019.12.24
2 changes: 1 addition & 1 deletion seq2science/envs/samtools.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ channels:
- conda-forge
- defaults
dependencies:
- bioconda::samtools=1.9
- bioconda::samtools=1.10
42 changes: 17 additions & 25 deletions seq2science/rules/DGE_analysis.smk
Original file line number Diff line number Diff line change
Expand Up @@ -2,35 +2,27 @@ def get_contrasts():
"""
splits contrasts that contain multiple comparisons
"""
if not config.get('contrasts', False):
if 'contrasts' not in config:
return []

# contrasts from config
old_contrasts = list(config["contrasts"])

new_contrasts = []
for contrast in old_contrasts:
original_contrast, contrast, batch = parse_DE_contrasts(contrast)

l = len(contrast)
if l == 1:
# write out the full contrast
lvls = samples[contrast[0]].dropna().unique().tolist()
new_contrast = batch + '+' + contrast[0] + '_' + lvls[0] + '_' + lvls[1] if batch is not None else contrast[0] + '_' + lvls[0] + '_' + lvls[1]
new_contrasts.append(new_contrast)
elif l == 2 or (l == 3 and 'all' in contrast[1:]):
for contrast in list(config["contrasts"]):
parsed_contrast, batch = parse_de_contrasts(contrast)
column_name = parsed_contrast[0]
components = len(parsed_contrast)
if components == 2 or (components == 3 and 'all' in contrast[1:]):
# create a list of all contrasts designed by the 'groupA vs all' design
reflvl = str(contrast[2]) if contrast[1] == 'all' else str(contrast[1])
lvls = samples[contrast[0]].dropna().unique().tolist()
lvls = list(map(lambda x: str(x), lvls))
reflvl = str(parsed_contrast[2]) if parsed_contrast[1] == 'all' else str(parsed_contrast[1])
lvls = [str(lvl) for lvl in samples[column_name].dropna().unique().tolist()]
lvls.remove(reflvl)

for lvl in lvls:
new_contrast = batch + '+' + contrast[0] + '_' + lvl + '_' + reflvl if batch is not None else contrast[0] + '_' + lvl + '_' + reflvl
new_contrasts.append(new_contrast)
else:
# remove '~', for uniformity
new_contrast = original_contrast.replace("~", "").replace(" ", "")
reflvl = parsed_contrast[2]
lvls = [parsed_contrast[1]]

for lvl in lvls:
new_contrast = f"{column_name}_{lvl}_{reflvl}"
if batch:
new_contrast = f"{batch}+{new_contrast}"
new_contrasts.append(new_contrast)

# get unique elements
Expand Down Expand Up @@ -63,7 +55,7 @@ rule deseq2:
threads: 4
params:
samples=os.path.abspath(config["samples"]),
replicates=True if config['technical_replicates'] == 'merge' else False
replicates=True if 'replicate' in samples else False
resources:
R_scripts=1 # conda's R can have issues when starting multiple times
script:
Expand All @@ -87,7 +79,7 @@ rule blind_clustering:
threads: 4
params:
samples=os.path.abspath(config["samples"]),
replicates=True if config['technical_replicates'] == 'merge' else False
replicates=True if 'replicate' in samples else False
resources:
R_scripts=1 # conda's R can have issues when starting multiple times
script:
Expand Down
2 changes: 1 addition & 1 deletion seq2science/rules/alignment.smk
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ def get_reads(wildcards):
"""
Function that returns the reads for any aligner.
"""
if 'replicate' in samples and config.get('technical_replicates') == 'merge':
if 'replicate' in samples:
if config['layout'].get(wildcards.sample, False) == "SINGLE":
return expand("{trimmed_dir}/merged/{{sample}}_trimmed.{fqsuffix}.gz", **config)
return sorted(expand("{trimmed_dir}/merged/{{sample}}_{fqext}_trimmed.{fqsuffix}.gz", **config))
Expand Down
10 changes: 5 additions & 5 deletions seq2science/rules/bam_cleaning.smk
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,8 @@ rule mark_duplicates:
input:
get_bam_mark_duplicates
output:
bam= expand("{dedup_dir}/{{assembly}}-{{sample}}.{{sorter}}-{{sorting}}.bam", **config),
metrics=expand("{qc_dir}/dedup/{{assembly}}-{{sample}}.{{sorter}}-{{sorting}}.metrics.txt", **config)
bam= expand("{final_bam_dir}/{{assembly}}-{{sample}}.{{sorter}}-{{sorting}}.bam", **config),
metrics=expand("{qc_dir}/markdup/{{assembly}}-{{sample}}.{{sorter}}-{{sorting}}.metrics.txt", **config)
log:
expand("{log_dir}/mark_duplicates/{{assembly}}-{{sample}}-{{sorter}}-{{sorting}}.log", **config)
benchmark:
Expand Down Expand Up @@ -219,7 +219,7 @@ rule bam2cram:
bam=rules.mark_duplicates.output.bam,
assembly=expand("{genome_dir}/{{assembly}}/{{assembly}}.fa", **config)
output:
expand("{dedup_dir}/{{assembly}}-{{sample}}.{{sorter}}-{{sorting}}.cram", **config),
expand("{final_bam_dir}/{{assembly}}-{{sample}}.{{sorter}}-{{sorting}}.cram", **config),
log:
expand("{log_dir}/bam2cram/{{assembly}}-{{sample}}-{{sorter}}-{{sorting}}.log", **config)
benchmark:
Expand All @@ -238,9 +238,9 @@ rule samtools_index_cram:
Generate the index for a cram file.
"""
input:
expand("{dedup_dir}/{{assembly}}-{{sample}}.{{sorter}}-{{sorting}}.cram", **config),
expand("{final_bam_dir}/{{assembly}}-{{sample}}.{{sorter}}-{{sorting}}.cram", **config),
output:
expand("{dedup_dir}/{{assembly}}-{{sample}}.{{sorter}}-{{sorting}}.cram.crai", **config),
expand("{final_bam_dir}/{{assembly}}-{{sample}}.{{sorter}}-{{sorting}}.cram.crai", **config),
log:
expand("{log_dir}/samtools_index_cram/{{assembly}}-{{sample}}-{{sorter}}-{{sorting}}.log", **config)
benchmark:
Expand Down
17 changes: 8 additions & 9 deletions seq2science/rules/bigfiles.smk
Original file line number Diff line number Diff line change
Expand Up @@ -134,11 +134,10 @@ rule peak_bigpeak:

def get_strandedness(wildcards):
    """
    Return the strandedness value for wildcards.sample.

    Looks up the 'strandedness' column of the module-level ``samples``
    dataframe. When a 'replicate' column is present, bigwigs are made per
    technical replicate, so the lookup is keyed on the replicate name
    instead of the sample name.
    """
    sample = f"{wildcards.sample}"
    s2 = samples
    if 'replicate' in samples:
        # re-index the (deduplicated) replicate/strandedness pairs by
        # replicate so that .loc[sample] works for replicate names too
        s2 = samples.reset_index()[['replicate', 'strandedness']].drop_duplicates().set_index('replicate')
    strandedness = s2["strandedness"].loc[sample]
    return strandedness


Expand All @@ -147,8 +146,8 @@ rule bam_stranded_bigwig:
Convert a bam file into two bigwig files, one for each strand
"""
input:
bam=expand("{dedup_dir}/{{assembly}}-{{sample}}.{{sorter}}-{{sorting}}.bam", **config),
bai=expand("{dedup_dir}/{{assembly}}-{{sample}}.{{sorter}}-{{sorting}}.bam.bai", **config)
bam=expand("{final_bam_dir}/{{assembly}}-{{sample}}.{{sorter}}-{{sorting}}.bam", **config),
bai=expand("{final_bam_dir}/{{assembly}}-{{sample}}.{{sorter}}-{{sorting}}.bam.bai", **config)
output:
forwards=temp(expand("{result_dir}/bigwigs/{{assembly}}-{{sample}}.{{sorter}}-{{sorting}}.fwd.bw", **config)),
reverses=temp(expand("{result_dir}/bigwigs/{{assembly}}-{{sample}}.{{sorter}}-{{sorting}}.rev.bw", **config))
Expand Down Expand Up @@ -184,8 +183,8 @@ rule bam_bigwig:
Convert a bam file into a bigwig file
"""
input:
bam=expand("{dedup_dir}/{{assembly}}-{{sample}}.{{sorter}}-{{sorting}}.bam", **config),
bai=expand("{dedup_dir}/{{assembly}}-{{sample}}.{{sorter}}-{{sorting}}.bam.bai", **config)
bam=expand("{final_bam_dir}/{{assembly}}-{{sample}}.{{sorter}}-{{sorting}}.bam", **config),
bai=expand("{final_bam_dir}/{{assembly}}-{{sample}}.{{sorter}}-{{sorting}}.bam.bai", **config)
output:
temp(expand("{result_dir}/bigwigs/{{assembly}}-{{sample}}.{{sorter}}-{{sorting}}.bw", **config))
params:
Expand Down
43 changes: 36 additions & 7 deletions seq2science/rules/configuration_generic.smk
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,31 @@ from snakemake.utils import validate
from snakemake.utils import min_version


logger.info(
"""\
____ ____ __
/ ___)( __) / \
\___ \ ) _) ( O )
(____/(____) \__\)
____
(___ \
/ __/
(____)
____ ___ __ ____ __ _ ___ ____
/ ___) / __) ( ) ( __)( ( \ / __)( __)
\___ \( (__ )( ) _) / /( (__ ) _)
(____/ \___) (__) (____)\_)__) \___)(____)
docs: https://vanheeringen-lab.github.io/seq2science
"""
)

if workflow.conda_frontend == "conda":
logger.info("NOTE: seq2science is using the conda frontend, for faster environment creation install mamba.")
# give people a second to appreciate this beautiful ascii art
time.sleep(1)


# config.yaml(s)


Expand Down Expand Up @@ -117,6 +142,10 @@ if 'descriptive_name' in samples:
if ('replicate' in samples and samples['descriptive_name'].to_list() == samples['replicate'].to_list()) or \
samples['descriptive_name'].to_list() == samples['sample'].to_list():
samples = samples.drop(columns=['descriptive_name'])
if 'strandedness' in samples:
samples['strandedness'] = samples['strandedness'].mask(pd.isnull, 'nan')
if config['filter_bam_by_strand'] is False or not any([field in list(samples['strandedness']) for field in ['forward', 'yes', 'reverse']]):
samples = samples.drop(columns=['strandedness'])

if 'replicate' in samples:
# check if replicate names are unique between assemblies
Expand All @@ -142,16 +171,15 @@ if "condition" in samples and "replicate" in samples:

# validate samples file
for schema in sample_schemas:
validate(samples, schema=f"../schemas/samples/{schema}.schema.yaml")
samples = samples.set_index('sample')
samples.index = samples.index.map(str)
validate(samples, schema=f"{config['rule_dir']}/../schemas/samples/{schema}.schema.yaml")

sanitized_samples = copy.copy(samples)


# sample layouts
samples = samples.set_index('sample')
samples.index = samples.index.map(str)


# sample layouts
# check if a sample is single-end or paired end, and store it
logger.info("Checking if samples are single-end or paired-end...")
layout_cachefile = os.path.expanduser('~/.config/snakemake/layouts.p')
Expand Down Expand Up @@ -290,7 +318,7 @@ config['layout'] = {**{key: value for key, value in config['layout'].items() if
logger.info("Done!\n\n")

# if samples are merged add the layout of the technical replicate to the config
if 'replicate' in samples and config.get('technical_replicates') == 'merge':
if 'replicate' in samples:
for sample in samples.index:
replicate = samples.loc[sample, 'replicate']
config['layout'][replicate] = config['layout'][sample]
Expand Down Expand Up @@ -380,8 +408,9 @@ else:
workflow.global_resources = {**{'mem_gb': np.clip(workflow.global_resources.get('mem_gb', 9999), 0, convert_size(mem.total, 3)[0])},
**workflow.global_resources}


dryrun=True
onstart:
dryrun=False
# save a copy of the latest samples and config file(s) in the log_dir
# skip this step on Jenkins, as it runs in parallel
if os.getcwd() != config['log_dir'] and not os.getcwd().startswith('/var/lib/jenkins'):
Expand Down
85 changes: 40 additions & 45 deletions seq2science/rules/configuration_workflows.smk
Original file line number Diff line number Diff line change
Expand Up @@ -59,14 +59,14 @@ if config.get('bam_sorter', False):

# make sure that our samples.tsv and configuration work together...
# ...on biological replicates
if 'condition' in samples and config.get('biological_replicates', '') != 'keep':
if 'condition' in samples:
if 'hmmratac' in config['peak_caller']:
assert config.get('biological_replicates', '') == 'idr', \
f'HMMRATAC peaks can only be combined through idr'

for condition in set(samples['condition']):
for assembly in set(samples[samples['condition'] == condition]['assembly']):
if 'replicate' in samples and config.get('technical_replicates') == 'merge':
if 'replicate' in samples:
nr_samples = len(set(samples[(samples['condition'] == condition) & (samples['assembly'] == assembly)]['replicate']))
else:
nr_samples = len(samples[(samples['condition'] == condition) & (samples['assembly'] == assembly)])
Expand All @@ -77,63 +77,58 @@ if 'condition' in samples and config.get('biological_replicates', '') != 'keep':
f' condition {condition} and assembly {assembly}'

# ...on DE contrasts
def parse_DE_contrasts(de_contrast):
"""
Extract batch and contrast groups from a DE contrast design
"""
original_contrast = de_contrast
if config.get('contrasts'):
def parse_de_contrasts(de_contrast):
    """
    Extract the contrast column, groups and batch effect column from a
    differential expression contrast design.

    Accepts a string containing a DESeq2 contrast design, e.g.
    "batch + condition_treated_control" or "~condition_treated_control".

    Returns
        parsed_contrast, list: the contrast components, i.e.
            [column_name, group1, group2, ...]
        batch, str or None: the batch effect column, or None when the
            design contains no '+'
    """
    # remove whitespaces (and '~'s if used) so the design can be split reliably
    de_contrast = de_contrast.replace(" ", "").replace("~", "")

    # split and store batch effect
    batch = None
    if '+' in de_contrast:
        # NOTE(review): only the first two '+'-separated parts are kept;
        # anything after a second '+' is silently dropped (pre-existing behavior)
        batch, de_contrast = de_contrast.split('+')[0:2]

    # parse contrast column and groups (underscore-separated)
    parsed_contrast = de_contrast.split('_')
    return parsed_contrast, batch

if config.get('contrasts', False):
# check differential gene expression contrasts
old_contrasts = list(config["contrasts"])
for contrast in old_contrasts:
original_contrast, parsed_contrast, batch = parse_DE_contrasts(contrast)
for contrast in list(config["contrasts"]):
parsed_contrast, batch = parse_de_contrasts(contrast)
column_name = parsed_contrast[0]

# Check if the column names can be recognized in the contrast
assert parsed_contrast[0] in samples.columns and parsed_contrast[0] not in ["sample", "assembly"], \
(f'\nIn contrast design "{original_contrast}", "{parsed_contrast[0]} ' +
assert column_name in samples.columns and column_name not in ["sample", "assembly"], \
(f'\nIn contrast design "{contrast}", "{column_name} ' +
f'does not match any valid column name in {config["samples"]}.\n')
if batch is not None:
assert batch in samples.columns and batch not in ["sample", "assembly"], \
(f'\nIn contrast design "{original_contrast}", the batch effect "{batch}" ' +
(f'\nIn contrast design "{contrast}", the batch effect "{batch}" ' +
f'does not match any valid column name in {config["samples"]}.\n')

# Check if the groups described by the contrast can be identified and found in samples.tsv
l = len(parsed_contrast)
assert l < 4, ("\nA differential expression contrast couldn't be parsed correctly.\n" +
f"{str(l-1)} groups were found in '{original_contrast}' " +
# Check if the contrast contains the number of components we can parse
components = len(parsed_contrast)
assert components > 1, (f"\nA differential expression contrast is too short: '{contrast}'.\n"
"Specify at least one reference group to compare against.\n")
assert components < 4, ("\nA differential expression contrast couldn't be parsed correctly.\n" +
f"{str(components-1)} groups were found in '{contrast}' " +
f"(groups: {', '.join(parsed_contrast[1:])}).\n\n" +
f'Tip: do not use underscores in the columns of {config["samples"]} referenced by your contrast.\n')
if l == 1:
# check if contrast column has exactly 2 factor levels (per assembly)
tmp = samples[['assembly', parsed_contrast[0]]].dropna()
factors = pd.DataFrame(tmp.groupby('assembly')[parsed_contrast[0]].nunique())
assert all(factors[parsed_contrast[0]] == 2),\
('\nYour contrast design, ' + original_contrast +
', contains only a column name (' + parsed_contrast[0] +
'). \nIf you wish to compare all groups in this column, add a reference group. ' +
'Number of groups found (per assembly): \n' + str(factors[parsed_contrast[0]]))
else:
# check if contrast column contains the groups
for group in parsed_contrast[1:]:
if group != 'all':
assert str(group) in [str(i) for i in samples[parsed_contrast[0]].tolist()],\
('\nYour contrast design contains group ' + group +
' which cannot be found in column ' + parsed_contrast[0] +
' of ' + config["samples"] + '.\n')

# Check if the groups in the contrast can be found in the right column of the samples.tsv
for group in parsed_contrast[1:]:
if group != 'all':
assert str(group) in [str(i) for i in samples[column_name].tolist()],\
(f'\nYour contrast design contains group {group} which cannot be found '
f'in column {column_name} of {config["samples"]}.\n')


def sieve_bam(configdict):
Expand Down
Loading

0 comments on commit cf045b3

Please sign in to comment.