Merge pull request #39 from JLSteenwyk/efficient

Efficient
JLSteenwyk · Sep 19, 2024 · c44037d · c44037d
2 parents 0427f33 + 10b4db1
commit c44037d
Show file tree

Hide file tree

Showing 94 changed files with 1,318 additions and 1,482 deletions.
diff --git a/change_log.txt b/change_log.txt
@@ -1,5 +1,14 @@
 Major changes to PhyKIT are summarized here.
 
+1.21.0
+  - The partition file outputted from the create_concat function has been updated
+  - Column information in the partition file is as follows:
+    - column 1: alignment name
+    - column 2: # of taxa present
+    - column 3: # of taxa missing
+    - column 4: fraction of occupancy
+    - column 5: names of missing taxa (; separated)
+
 1.20.0
   - Fixed bug for thread_dna function when using a ClipKIT log file. Input protein alignment must be the untrimmed alignment.
 

diff --git a/docs/change_log/index.rst b/docs/change_log/index.rst
@@ -8,6 +8,14 @@ Change log
 
 Major changes to PhyKIT are summarized here.
 
+**1.21.0**:
+The partition file outputted from the create_concat function has been updated to the following format:
+- column 1: alignment name
+- column 2: # of taxa present
+- column 3: # of taxa missing
+- column 4: fraction of occupancy
+- column 5: names of missing taxa (; separated)
+
 **1.20.0**:
 Fixed bug for thread_dna function when using a ClipKIT log file. Input protein alignment must be the untrimmed alignment.
 

diff --git a/phykit/helpers/files.py b/phykit/helpers/files.py
@@ -1,7 +1,9 @@
 from enum import Enum
 import sys
+from typing import Tuple
 
 from Bio import AlignIO
+from Bio.Align import MultipleSeqAlignment
 
 
 class FileFormat(Enum):
@@ -15,12 +17,17 @@ class FileFormat(Enum):
     stockholm = "stockholm"
 
 
-def get_alignment_and_format(alignment_file_path: str):
-    # if file format is provided, read the file according to the user's file format
+def get_alignment_and_format(
+    alignment_file_path: str
+) -> Tuple[MultipleSeqAlignment, str, bool]:
+    # if file format is provided, read the file
+    # according to the user's file format
     for fileFormat in FileFormat:
         try:
-            alignment = AlignIO.read(open(alignment_file_path), fileFormat.value)
-            return alignment, fileFormat.value
+            alignment = AlignIO.read(
+                open(alignment_file_path), fileFormat.value
+            )
+            return alignment, fileFormat.value, is_protein_alignment(alignment)
         # the following exceptions refer to skipping over errors
         # associated with reading the wrong input file
         except ValueError:
@@ -33,6 +40,21 @@ def get_alignment_and_format(alignment_file_path: str):
             sys.exit()
 
 
+def is_protein_alignment(alignment: MultipleSeqAlignment) -> bool:
+    nucleotide_set = {
+        "A", "C", "G", "T", "U", "-", "N", "?", "*"
+    }
+
+    for record in alignment:
+        seq_set = set(record.seq.upper())
+        if seq_set - nucleotide_set:
+            # if there are chars that are not in the nucl set,
+            # it's likely a protein sequence
+            return True
+
+    return False
+
+
 def read_single_column_file_to_list(single_col_file_path: str) -> list:
     try:
         with open(single_col_file_path) as f:

diff --git a/phykit/phykit.py b/phykit/phykit.py
@@ -2563,6 +2563,11 @@ def create_concatenation_matrix(argv):
                 2) A partition file ready for input into RAxML or IQ-tree.
                 3) An occupancy file that summarizes the taxon occupancy
                    per sequence.
+                        - column 1: alignment name
+                        - column 2: # of taxa present
+                        - column 3: # of taxa missing
+                        - column 4: fraction of occupancy
+                        - column 5: names of missing taxa (; separated)
 
                 Aliases:
                   create_concatenation_matrix, create_concat, cc

diff --git a/phykit/services/alignment/alignment_length.py b/phykit/services/alignment/alignment_length.py
@@ -1,14 +1,16 @@
 from .base import Alignment
 
+from typing import Dict
+
 
 class AlignmentLength(Alignment):
     def __init__(self, args) -> None:
         super().__init__(**self.process_args(args))
 
-    def run(self):
-        alignment, _ = self.get_alignment_and_format()
+    def run(self) -> None:
+        alignment, _, _ = self.get_alignment_and_format()
         aln_len = alignment.get_alignment_length()
         print(aln_len)
 
-    def process_args(self, args):
+    def process_args(self, args) -> Dict[str, str]:
         return dict(alignment_file_path=args.alignment)
diff --git a/phykit/services/alignment/alignment_length_no_gaps.py b/phykit/services/alignment/alignment_length_no_gaps.py
@@ -1,40 +1,62 @@
+from argparse import Namespace
+from typing import Dict, Tuple
+
+from Bio.Align import MultipleSeqAlignment
+
 from .base import Alignment
 
 
 class AlignmentLengthNoGaps(Alignment):
     def __init__(self, args) -> None:
         super().__init__(**self.process_args(args))
 
-    def run(self):
-        alignment, alignment_format = self.get_alignment_and_format()
+    def run(self) -> None:
+        alignment, _, is_protein = self.get_alignment_and_format()
         (
             aln_len_no_gaps,
             aln_len,
             aln_len_no_gaps_per,
-        ) = self.calculate_alignment_length_no_gaps(alignment)
+        ) = self.calculate_alignment_length_no_gaps(alignment, is_protein)
         print(f"{aln_len_no_gaps}\t{aln_len}\t{round(aln_len_no_gaps_per, 4)}")
 
-    def process_args(self, args):
+    def process_args(
+        self,
+        args: Namespace,
+    ) -> Dict[str, str]:
         return dict(alignment_file_path=args.alignment)
 
-    def calculate_alignment_length_no_gaps(self, alignment):
+    def calculate_alignment_length_no_gaps(
+        self,
+        alignment: MultipleSeqAlignment,
+        is_protein: bool,
+    ) -> Tuple[int, int, float]:
         aln_len = alignment.get_alignment_length()
-        aln_len_no_gaps = self.get_sites_no_gaps_count(alignment, aln_len)
+        aln_len_no_gaps = self.get_sites_no_gaps_count(
+            alignment,
+            aln_len,
+            is_protein
+        )
 
-        # calculate percent of variable sites
         aln_len_no_gaps_per = (aln_len_no_gaps / aln_len) * 100
 
         return aln_len_no_gaps, aln_len, aln_len_no_gaps_per
 
-    def get_sites_no_gaps_count(self, alignment, aln_len):
+    def get_sites_no_gaps_count(
+        self,
+        alignment: MultipleSeqAlignment,
+        aln_len: int,
+        is_protein: bool,
+    ) -> int:
         """
         Count sites in the alignment with no gaps
         """
         aln_len_no_gaps = 0
-        for i in range(0, aln_len):
-            seq_at_position = ""
-            seq_at_position += alignment[:, i]
-            if "-" not in seq_at_position:
+
+        gap_chars = self.get_gap_chars()
+
+        for i in range(aln_len):
+            column = set(alignment[:, i])
+            if column.isdisjoint(gap_chars):
                 aln_len_no_gaps += 1
 
         return aln_len_no_gaps
diff --git a/phykit/services/alignment/alignment_recoding.py b/phykit/services/alignment/alignment_recoding.py
@@ -1,5 +1,8 @@
 from os import path
 import sys
+from typing import Dict, List
+
+from Bio.Align import MultipleSeqAlignment
 
 from .base import Alignment
 
@@ -10,34 +13,42 @@ class AlignmentRecoding(Alignment):
     def __init__(self, args) -> None:
         super().__init__(**self.process_args(args))
 
-    def run(self):
-        alignment, _ = self.get_alignment_and_format()
-        
+    def run(self) -> None:
+        alignment, _, is_protein = self.get_alignment_and_format()
+
         recoding_table = self.read_recoding_table(self.code[0])
 
-        recoded_alignment = self.recode_alignment_as_dict(
-            alignment, recoding_table
+        recoded_alignment = self.recode_alignment(
+            alignment, recoding_table, is_protein
         )
 
         for k, v in recoded_alignment.items():
             print(f">{k}\n{''.join(v)}")
 
-    def recode_alignment_as_dict(self, alignment, recoding_table: dict) -> dict:
+    def recode_alignment(
+        self,
+        alignment: MultipleSeqAlignment,
+        recoding_table: Dict[str, str],
+        is_protein: bool,
+    ) -> Dict[str, List[str]]:
+
+        gap_chars = self.get_gap_chars()
         recoded_alignment = dict()
-        for i in range(0, len(alignment)):
-            recoded_sequence_i = []
-            for j in range(alignment.get_alignment_length()):
-                sequence_ij = alignment[i, j].upper()
-                if sequence_ij in ["?", "-", "X"]:
-                    recoded_sequence_i.append(sequence_ij)
-                else:
-                    recoded_sequence_i.append(recoding_table[sequence_ij])
 
-            recoded_alignment[alignment[i].id] = recoded_sequence_i
+        for record in alignment:
+            recoded_sequence = [
+                recoding_table.get(base.upper(), base)
+                if base not in gap_chars else base
+                for base in record.seq
+            ]
+            recoded_alignment[record.id] = recoded_sequence
 
         return recoded_alignment
 
-    def read_recoding_table(self, recoding: str) -> dict:
+    def read_recoding_table(
+        self,
+        recoding: str
+    ) -> Dict[str, str]:
         """
         return translation table with codons as keys and amino acids as values
         """
@@ -47,33 +58,27 @@ def read_recoding_table(self, recoding: str) -> dict:
         if recoding is None:
             print("Please specify a recoding table")
             sys.exit()
-        elif recoding == "RY-nucleotide":
-            pathing = path.join(here, "../../recoding_tables/RY-nucleotide.txt")
-        elif recoding == "SandR-6":
-            pathing = path.join(here, "../../recoding_tables/S_and_R-6.txt")
-        elif recoding == "KGB-6":
-            pathing = path.join(here, "../../recoding_tables/KGB-6.txt")
-        elif recoding == "Dayhoff-6":
-            pathing = path.join(here, "../../recoding_tables/Dayhoff-6.txt")
-        elif recoding == "Dayhoff-9":
-            pathing = path.join(here, "../../recoding_tables/Dayhoff-9.txt")
-        elif recoding == "Dayhoff-12":
-            pathing = path.join(here, "../../recoding_tables/Dayhoff-12.txt")
-        elif recoding == "Dayhoff-15":
-            pathing = path.join(here, "../../recoding_tables/Dayhoff-15.txt")
-        elif recoding == "Dayhoff-18":
-            pathing = path.join(here, "../../recoding_tables/Dayhoff-18.txt")
-        # handling case of a custom translation table
-        else:
-            pathing = str(recoding)
-
-        with open(pathing) as code:
-            for line in code:
-                line = line.split()
-                if line[1].upper() in recoding_table.keys():
-                    recoding_table[line[1]].upper().append(line[0].upper())
-                else:
-                    recoding_table[line[1]] = line[0].upper()
+
+        recoding_paths = {
+            "RY-nucleotide": "../../recoding_tables/RY-nucleotide.txt",
+            "SandR-6": "../../recoding_tables/S_and_R-6.txt",
+            "KGB-6": "../../recoding_tables/KGB-6.txt",
+            "Dayhoff-6": "../../recoding_tables/Dayhoff-6.txt",
+            "Dayhoff-9": "../../recoding_tables/Dayhoff-9.txt",
+            "Dayhoff-12": "../../recoding_tables/Dayhoff-12.txt",
+            "Dayhoff-15": "../../recoding_tables/Dayhoff-15.txt",
+            "Dayhoff-18": "../../recoding_tables/Dayhoff-18.txt",
+        }
+        pathing = recoding_paths.get(recoding, str(recoding))
+
+        try:
+            with open(path.join(here, pathing)) as code:
+                for line in code:
+                    parts = line.split()
+                    recoding_table[parts[1].upper()] = parts[0].upper()
+        except FileNotFoundError:
+            print(f"Recoding table file '{pathing}' not found.")
+            sys.exit()
 
         return recoding_table
 

diff --git a/phykit/services/alignment/base.py b/phykit/services/alignment/base.py
@@ -1,5 +1,8 @@
+from collections import Counter
 import sys
 
+from typing import List
+
 from ..base import BaseService
 from ...helpers.files import (
     get_alignment_and_format as get_alignment_and_format_helper
@@ -49,47 +52,38 @@ def get_alignment_and_format(self):
             print("Please double check pathing and filenames")
             sys.exit()
 
-    def calculate_rcv(self):
-        alignment, _ = self.get_alignment_and_format()
+    def calculate_rcv(self) -> float:
+        alignment, _, _ = self.get_alignment_and_format()
         aln_len = alignment.get_alignment_length()
 
-        # string to hold all sequences
-        concat_seq = ""
-        # initialize a counter for the number of sequences in the input fasta file
-        num_records = 0
+        concat_seq = []
+        num_records = len(alignment)
 
-        # for each record join concatSeq string and sequence as well as keeping track
-        # of the number of records
-        for record in alignment:
-            concat_seq += record.seq
-            num_records += 1
+        concat_seq = "".join(str(record.seq) for record in alignment)
 
-        # dictionary to hold the average occurence of each sequence letter
-        average_d = {}
-        # loop through the different sequences that appear in the fasta file
-        # population dictionary with how many times that sequence appears
-        for seq in set(concat_seq):
-            average_d[seq] = concat_seq.count(seq) / num_records
+        total_counts = Counter(concat_seq)
+
+        average_d = {
+            seq: total_counts[seq] / num_records for seq in total_counts
+        }
 
-        # intiailize list to hold the RCV values per ith taxa
-        # that will later be summed
         indiv_rcv_values = []
 
-        # loop through records again and calculate RCV for
-        # each taxa and append to indivRCVvalues
         for record in alignment:
-            # temp holds a temporary value of the numerator before appending
-            # to numeratorRCVvalues and then is reassigned to 0 when it goes
-            # through the loop again
-            temp = 0
-            # calculates the absolute value of the ith sequence letter minus the average
-            for seq_letter in set(concat_seq):
-                temp += abs(
-                    record.seq.count(seq_letter) - average_d[seq_letter]
-                )
-            indiv_rcv_values.append(temp / (num_records * aln_len))
+            record_counts = Counter(record.seq)
+            temp_rcv = sum(
+                abs(
+                    record_counts[seq_letter] - average_d[seq_letter]
+                ) for seq_letter in total_counts
+            )
+            indiv_rcv_values.append(temp_rcv / (num_records * aln_len))
 
         relative_composition_variability = sum(indiv_rcv_values)
 
-        # print the sum of all RCV values
         return relative_composition_variability
+
+    def get_gap_chars(is_protein: bool) -> List[str]:
+        if is_protein:
+            return ["-", "?", "*", "X", "x"]
+        else:
+            return ["-", "?", "*", "X", "x", "N", "n"]