Skip to content

Commit

Permalink
Merge pull request #1 from nidhi12k/master
Browse files Browse the repository at this point in the history
Modifications to integrate decipher for MSA
  • Loading branch information
wharvey31 authored Apr 3, 2024
2 parents 35cd03b + 1f042ad commit 0bb60ef
Show file tree
Hide file tree
Showing 7 changed files with 61 additions and 20 deletions.
3 changes: 3 additions & 0 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ samples_df = pd.read_csv(
)

REF_FA = config["REF"]
MSA_ALG = config.get("MSA_ALG", "mafft")


include: "rules/common.smk"
Expand All @@ -35,6 +36,8 @@ val_methods = ["READS", "ASM", "SVPOP", "CALLABLE"]
val_types = []

# Collect every validation type requested in the config, considering only
# the validation methods that are actually present as config keys.
for method in val_methods:
    if method in config:
        val_types.extend(config[method])

Expand Down
7 changes: 7 additions & 0 deletions workflow/envs/decipher_env.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Conda environment for the DECIPHER-based MSA step
# (used by rule decipher_msa in workflow/rules/msa.smk via scripts/decipher.R).
channels:
- bioconda
- conda-forge
- defaults
dependencies:
# NOTE(review): R pinned to 4.2.1 — presumably for compatibility with the
# bioconductor-decipher build on these channels; confirm before bumping.
- r-base=4.2.1
- bioconductor-decipher
11 changes: 10 additions & 1 deletion workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -458,7 +458,7 @@ def gather_callable_haps(wildcards):
rules.callable_bed.output.tab,
sample=wildcards.sample,
val_type=wildcards.val_type,
hap=["hap1", "hap2"],
hap=["h1", "h2"],
parent=[
samples_df.at[wildcards.sample, "MO"],
samples_df.at[wildcards.sample, "FA"],
Expand Down Expand Up @@ -488,6 +488,15 @@ def find_asm_aln(wildcards):
return config.get("ASM")[wildcards.val_type]


def find_aln_type(wildcards):
    """Return the MSA output file for the configured alignment algorithm.

    Dispatches on the module-level ``MSA_ALG`` setting ("clustal" or
    "mafft") and returns the matching rule's output path for Snakemake to
    use as an input. Terminates the workflow with exit status 1 on any
    other value.
    """
    if MSA_ALG == "clustal":
        # NOTE(review): this commit replaces rule clustalo with
        # make_multi_fa/decipher_msa in msa.smk — confirm rules.clustalo
        # still exists, or whether a "decipher" branch belongs here too.
        return rules.clustalo.output.clust
    if MSA_ALG == "mafft":
        return rules.mafft.output.clust
    # sys.exit(msg) prints the message to stderr (not stdout, as the
    # original print() did) and exits with status 1, preserving the
    # SystemExit behavior callers/Snakemake observe.
    sys.exit("Invalid MSA algorithm, must be either: clustal or mafft")

def find_int(wildcards):
return expand(
config.get("SVPOP")[wildcards.val_type],
Expand Down
38 changes: 24 additions & 14 deletions workflow/rules/msa.smk
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@


import os
SNAKEMAKE_DIR = os.path.dirname(workflow.snakefile)
MSA_DIR=os.path.dirname(SNAKEMAKE_DIR)

rule extract_seq:
input:
Expand Down Expand Up @@ -36,29 +37,38 @@ rule rename:
else:
outfile.write(line)


rule clustalo:
# Concatenate the per-haplotype FASTA files for one variant-ID set into a
# single multi-FASTA, which the DECIPHER MSA step (rule decipher_msa) aligns.
rule make_multi_fa:
    input:
        # combine_fasta is an input function defined elsewhere in the workflow
        fa=combine_fasta,
    output:
        multi_fa="temp/validation/ASM/{val_type}/{sample}/decipher/{vartype}_{svtype}/{ids}/seq.fa",
    resources:
        # exponential back-off on retry: 4, 16, 64, ... (presumably GB — confirm
        # against the cluster profile's mem units)
        mem=lambda wildcards, attempt : 4**attempt,
        hrs=24,
    threads: 1
    shell:
        # The sed strips the trailing "seq.fa" to recover the directory path.
        # NOTE(review): Snakemake creates output directories itself, so the
        # mkdir -p is likely redundant — confirm before removing.
        "mkdir -p $( echo {output.multi_fa} | sed 's/seq.fa//' );"
        "cat {input.fa} > {output.multi_fa} ;"


# Run a multiple sequence alignment with DECIPHER (scripts/decipher.R) on the
# combined multi-FASTA, producing an HTML visualization and an aligned FASTA.
rule decipher_msa:
    input:
        fa=rules.make_multi_fa.output.multi_fa,
    output:
        # HTML rendering of the alignment (DECIPHER BrowseSeqs)
        html_out="temp/validation/ASM/{val_type}/{sample}/decipher/{vartype}_{svtype}/{ids}/decipher.html",
        # aligned sequences in FASTA format, consumed by rule process_align
        aln_out="temp/validation/ASM/{val_type}/{sample}/decipher/{vartype}_{svtype}/{ids}/decipher.out",
    resources:
        # exponential back-off on retry: 4, 16, 64, ... (presumably GB)
        mem=lambda wildcards, attempt : 4**attempt,
        hrs=24,
    threads: 1
    conda:
        '../envs/decipher_env.yaml'
    script:
        "../scripts/decipher.R"

rule process_align:
input:
clust=rules.clustalo.output.clust,
decipher=rules.decipher_msa.output.aln_out,
output:
bed="temp/validation/ASM/{val_type}/{vartype}_{svtype}/{ids}/{sample}_{hap}_gap.bed",
resources:
Expand All @@ -67,7 +77,7 @@ rule process_align:
threads: 1
run:
record_dict = {}
with open(input.clust) as handle:
with open(input.decipher) as handle:
for record in SeqIO.parse(handle, "fasta"):
text = str(record.seq)
hap = record.id
Expand Down
6 changes: 3 additions & 3 deletions workflow/rules/svpop_int.smk
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ rule int_check:
hrs=24,
threads: 1
run:
val_dict = {"A": "VALID", "A,B": "NOTVALID"}
val_dict = {"A": "VALID", "AB": "NOTVALID"}
val_df = pd.merge(
pd.read_csv(
input.int_val[0], sep="\t", usecols=["ID_A", "SOURCE_SET"]
Expand All @@ -24,10 +24,10 @@ rule int_check:
val_df = val_df.rename(columns={"ID_A": "ID"})
all_df = pd.read_csv(input.bed, sep="\t").merge(val_df)
val_df[f"SVPOP_{wildcards.val_type}_MO"] = val_df.apply(
lambda row: val_dict[row["SOURCE_SET_MO"]], axis=1
lambda row: val_dict[row["SOURCE_SET_MO"].replace(",", "")], axis=1
)
val_df[f"SVPOP_{wildcards.val_type}_FA"] = val_df.apply(
lambda row: val_dict[row["SOURCE_SET_FA"]], axis=1
lambda row: val_dict[row["SOURCE_SET_FA"].replace(",", "")], axis=1
)
val_df[
["ID", f"SVPOP_{wildcards.val_type}_MO", f"SVPOP_{wildcards.val_type}_FA"]
Expand Down
4 changes: 2 additions & 2 deletions workflow/scripts/combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
if i == 0:
df = pd.read_csv(file, sep="\t")
else:
df = df.merge(pd.read_csv(file, sep="\t"))
df = df.merge(pd.read_csv(file, sep="\t"), how="outer")

df["DNSVAL"] = df.apply(
lambda row: "PASS"
Expand All @@ -21,7 +21,7 @@
axis=1,
)

df[["ID", "DNSVAL"]].merge(bed_df).to_csv(
df.loc[df['DNSVAL'] == "PASS"][["ID", "DNSVAL"]].merge(bed_df).to_csv(
snakemake.output.val, sep="\t", index=False
)
df.to_csv(snakemake.output.all_calls, sep="\t", index=False)
12 changes: 12 additions & 0 deletions workflow/scripts/decipher.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Align the sequences of a multi-FASTA with DECIPHER and write both an HTML
# visualization and the aligned FASTA. Driven by Snakemake (rule decipher_msa):
# reads paths from the snakemake S4 object.
suppressMessages(library(DECIPHER))

# Removed the unused `sample <- snakemake@wildcards[['sample']]` binding:
# the wildcard was read but never referenced.
multifasta <- snakemake@input[['fa']]
html_out <- snakemake@output[['html_out']]
aln_out <- snakemake@output[['aln_out']]

seqs <- readDNAStringSet(multifasta)
# Two alignment iterations with one refinement pass; verbose off to keep the
# Snakemake log quiet.
aligned <- AlignSeqs(seqs, iterations = 2, refinements = 1, verbose = FALSE)

# colWidth = 100 wraps the HTML alignment display at 100 columns per row.
BrowseSeqs(aligned, htmlFile = html_out, openURL = FALSE, colWidth = 100)
writeXStringSet(aligned, file = aln_out)

0 comments on commit 0bb60ef

Please sign in to comment.