Skip to content

Commit

Permalink
Merge pull request #1 from nidhi12k/master
Browse files Browse the repository at this point in the history
Modifications to integrate decipher for MSA
  • Loading branch information
wharvey31 authored Apr 3, 2024
2 parents 35cd03b + 1f042ad commit 0bb60ef
Show file tree
Hide file tree
Showing 7 changed files with 61 additions and 20 deletions.
3 changes: 3 additions & 0 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ samples_df = pd.read_csv(
)

REF_FA = config["REF"]
MSA_ALG = config.get("MSA_ALG", "mafft")


include: "rules/common.smk"
Expand All @@ -35,6 +36,8 @@ val_methods = ["READS", "ASM", "SVPOP", "CALLABLE"]
val_types = []

# Collect every validation type requested in the config, considering only
# the validation methods that are actually present as config keys.
for method in val_methods:
    if method in config:
        val_types.extend(config[method])

Expand Down
7 changes: 7 additions & 0 deletions workflow/envs/decipher_env.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Conda environment for the DECIPHER-based MSA step
# (used by rule decipher_msa in workflow/rules/msa.smk via scripts/decipher.R).
channels:
- bioconda
- conda-forge
- defaults
dependencies:
# NOTE(review): R pinned to 4.2.1 — presumably for compatibility with the
# bioconductor-decipher build on these channels; confirm before bumping.
- r-base=4.2.1
- bioconductor-decipher
11 changes: 10 additions & 1 deletion workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -458,7 +458,7 @@ def gather_callable_haps(wildcards):
rules.callable_bed.output.tab,
sample=wildcards.sample,
val_type=wildcards.val_type,
hap=["hap1", "hap2"],
hap=["h1", "h2"],
parent=[
samples_df.at[wildcards.sample, "MO"],
samples_df.at[wildcards.sample, "FA"],
Expand Down Expand Up @@ -488,6 +488,15 @@ def find_asm_aln(wildcards):
return config.get("ASM")[wildcards.val_type]


def find_aln_type(wildcards):
    """Return the MSA output file for the configured alignment algorithm.

    Dispatches on the module-level ``MSA_ALG`` setting ("clustal" or
    "mafft") and returns the matching rule's output path for Snakemake to
    use as an input. Terminates the workflow with exit status 1 on any
    other value.
    """
    if MSA_ALG == "clustal":
        # NOTE(review): this commit replaces rule clustalo with
        # make_multi_fa/decipher_msa in msa.smk — confirm rules.clustalo
        # still exists, or whether a "decipher" branch belongs here too.
        return rules.clustalo.output.clust
    if MSA_ALG == "mafft":
        return rules.mafft.output.clust
    # sys.exit(msg) prints the message to stderr (not stdout, as the
    # original print() did) and exits with status 1, preserving the
    # SystemExit behavior callers/Snakemake observe.
    sys.exit("Invalid MSA algorithm, must be either: clustal or mafft")

def find_int(wildcards):
return expand(
config.get("SVPOP")[wildcards.val_type],
Expand Down
38 changes: 24 additions & 14 deletions workflow/rules/msa.smk
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@


import os
SNAKEMAKE_DIR = os.path.dirname(workflow.snakefile)
MSA_DIR=os.path.dirname(SNAKEMAKE_DIR)

rule extract_seq:
input:
Expand Down Expand Up @@ -36,29 +37,38 @@ rule rename:
else:
outfile.write(line)


rule clustalo:
# Concatenate the per-haplotype FASTA files for one variant-ID set into a
# single multi-FASTA, which the DECIPHER MSA step (rule decipher_msa) aligns.
rule make_multi_fa:
    input:
        # combine_fasta is an input function defined elsewhere in the workflow
        fa=combine_fasta,
    output:
        multi_fa="temp/validation/ASM/{val_type}/{sample}/decipher/{vartype}_{svtype}/{ids}/seq.fa",
    resources:
        # exponential back-off on retry: 4, 16, 64, ... (presumably GB — confirm
        # against the cluster profile's mem units)
        mem=lambda wildcards, attempt : 4**attempt,
        hrs=24,
    threads: 1
    shell:
        # The sed strips the trailing "seq.fa" to recover the directory path.
        # NOTE(review): Snakemake creates output directories itself, so the
        # mkdir -p is likely redundant — confirm before removing.
        "mkdir -p $( echo {output.multi_fa} | sed 's/seq.fa//' );"
        "cat {input.fa} > {output.multi_fa} ;"


# Run a multiple sequence alignment with DECIPHER (scripts/decipher.R) on the
# combined multi-FASTA, producing an HTML visualization and an aligned FASTA.
rule decipher_msa:
    input:
        fa=rules.make_multi_fa.output.multi_fa,
    output:
        # HTML rendering of the alignment (DECIPHER BrowseSeqs)
        html_out="temp/validation/ASM/{val_type}/{sample}/decipher/{vartype}_{svtype}/{ids}/decipher.html",
        # aligned sequences in FASTA format, consumed by rule process_align
        aln_out="temp/validation/ASM/{val_type}/{sample}/decipher/{vartype}_{svtype}/{ids}/decipher.out",
    resources:
        # exponential back-off on retry: 4, 16, 64, ... (presumably GB)
        mem=lambda wildcards, attempt : 4**attempt,
        hrs=24,
    threads: 1
    conda:
        '../envs/decipher_env.yaml'
    script:
        "../scripts/decipher.R"

rule process_align:
input:
clust=rules.clustalo.output.clust,
decipher=rules.decipher_msa.output.aln_out,
output:
bed="temp/validation/ASM/{val_type}/{vartype}_{svtype}/{ids}/{sample}_{hap}_gap.bed",
resources:
Expand All @@ -67,7 +77,7 @@ rule process_align:
threads: 1
run:
record_dict = {}
with open(input.clust) as handle:
with open(input.decipher) as handle:
for record in SeqIO.parse(handle, "fasta"):
text = str(record.seq)
hap = record.id
Expand Down
6 changes: 3 additions & 3 deletions workflow/rules/svpop_int.smk
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ rule int_check:
hrs=24,
threads: 1
run:
val_dict = {"A": "VALID", "A,B": "NOTVALID"}
val_dict = {"A": "VALID", "AB": "NOTVALID"}
val_df = pd.merge(
pd.read_csv(
input.int_val[0], sep="\t", usecols=["ID_A", "SOURCE_SET"]
Expand All @@ -24,10 +24,10 @@ rule int_check:
val_df = val_df.rename(columns={"ID_A": "ID"})
all_df = pd.read_csv(input.bed, sep="\t").merge(val_df)
val_df[f"SVPOP_{wildcards.val_type}_MO"] = val_df.apply(
lambda row: val_dict[row["SOURCE_SET_MO"]], axis=1
lambda row: val_dict[row["SOURCE_SET_MO"].replace(",", "")], axis=1
)
val_df[f"SVPOP_{wildcards.val_type}_FA"] = val_df.apply(
lambda row: val_dict[row["SOURCE_SET_FA"]], axis=1
lambda row: val_dict[row["SOURCE_SET_FA"].replace(",", "")], axis=1
)
val_df[
["ID", f"SVPOP_{wildcards.val_type}_MO", f"SVPOP_{wildcards.val_type}_FA"]
Expand Down
4 changes: 2 additions & 2 deletions workflow/scripts/combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
if i == 0:
df = pd.read_csv(file, sep="\t")
else:
df = df.merge(pd.read_csv(file, sep="\t"))
df = df.merge(pd.read_csv(file, sep="\t"), how="outer")

df["DNSVAL"] = df.apply(
lambda row: "PASS"
Expand All @@ -21,7 +21,7 @@
axis=1,
)

df[["ID", "DNSVAL"]].merge(bed_df).to_csv(
df.loc[df['DNSVAL'] == "PASS"][["ID", "DNSVAL"]].merge(bed_df).to_csv(
snakemake.output.val, sep="\t", index=False
)
df.to_csv(snakemake.output.all_calls, sep="\t", index=False)
12 changes: 12 additions & 0 deletions workflow/scripts/decipher.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Align the sequences of a multi-FASTA with DECIPHER and write both an HTML
# visualization and the aligned FASTA. Driven by Snakemake (rule decipher_msa):
# reads paths from the snakemake S4 object.
suppressMessages(library(DECIPHER))

# Removed the unused `sample <- snakemake@wildcards[['sample']]` binding:
# the wildcard was read but never referenced.
multifasta <- snakemake@input[['fa']]
html_out <- snakemake@output[['html_out']]
aln_out <- snakemake@output[['aln_out']]

seqs <- readDNAStringSet(multifasta)
# Two alignment iterations with one refinement pass; verbose off to keep the
# Snakemake log quiet.
aligned <- AlignSeqs(seqs, iterations = 2, refinements = 1, verbose = FALSE)

# colWidth = 100 wraps the HTML alignment display at 100 columns per row.
BrowseSeqs(aligned, htmlFile = html_out, openURL = FALSE, colWidth = 100)
writeXStringSet(aligned, file = aln_out)

0 comments on commit 0bb60ef

Please sign in to comment.