Skip to content

Commit

Permalink
Merge pull request #39 from JLSteenwyk/efficient
Browse files Browse the repository at this point in the history
Efficient
  • Loading branch information
JLSteenwyk authored Sep 19, 2024
2 parents 0427f33 + 10b4db1 commit c44037d
Show file tree
Hide file tree
Showing 94 changed files with 1,318 additions and 1,482 deletions.
9 changes: 9 additions & 0 deletions change_log.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
Major changes to PhyKIT are summarized here.

1.21.0
- The partition file outputted from the create_concat function has been updated
- Column information in the partition file is as follows:
- column 1: alignment name
- column 2: # of taxa present
- column 3: # of taxa missing
- column 4: fraction of occupancy
- column 5: names of missing taxa (; separated)

1.20.0
- Fixed bug for thread_dna function when using a ClipKIT log file. Input protein alignment must be the untrimmed alignment.

Expand Down
8 changes: 8 additions & 0 deletions docs/change_log/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,14 @@ Change log

Major changes to PhyKIT are summarized here.

**1.21.0**:
The partition file outputted from the create_concat function has been updated to the following format:
- column 1: alignment name
- column 2: # of taxa present
- column 3: # of taxa missing
- column 4: fraction of occupancy
- column 5: names of missing taxa (; separated)

**1.20.0**:
Fixed bug for thread_dna function when using a ClipKIT log file. Input protein alignment must be the untrimmed alignment.

Expand Down
30 changes: 26 additions & 4 deletions phykit/helpers/files.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from enum import Enum
import sys
from typing import Tuple

from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment


class FileFormat(Enum):
Expand All @@ -15,12 +17,17 @@ class FileFormat(Enum):
stockholm = "stockholm"


def get_alignment_and_format(alignment_file_path: str):
# if file format is provided, read the file according to the user's file format
def get_alignment_and_format(
alignment_file_path: str
) -> Tuple[MultipleSeqAlignment, str, bool]:
# if file format is provided, read the file
# according to the user's file format
for fileFormat in FileFormat:
try:
alignment = AlignIO.read(open(alignment_file_path), fileFormat.value)
return alignment, fileFormat.value
alignment = AlignIO.read(
open(alignment_file_path), fileFormat.value
)
return alignment, fileFormat.value, is_protein_alignment(alignment)
# the following exceptions refer to skipping over errors
# associated with reading the wrong input file
except ValueError:
Expand All @@ -33,6 +40,21 @@ def get_alignment_and_format(alignment_file_path: str):
sys.exit()


def is_protein_alignment(alignment: MultipleSeqAlignment) -> bool:
nucleotide_set = {
"A", "C", "G", "T", "U", "-", "N", "?", "*"
}

for record in alignment:
seq_set = set(record.seq.upper())
if seq_set - nucleotide_set:
# if there are chars that are not in the nucl set,
# it's likely a protein sequence
return True

return False


def read_single_column_file_to_list(single_col_file_path: str) -> list:
try:
with open(single_col_file_path) as f:
Expand Down
5 changes: 5 additions & 0 deletions phykit/phykit.py
Original file line number Diff line number Diff line change
Expand Up @@ -2563,6 +2563,11 @@ def create_concatenation_matrix(argv):
2) A partition file ready for input into RAxML or IQ-tree.
3) An occupancy file that summarizes the taxon occupancy
per sequence.
- column 1: alignment name
- column 2: # of taxa present
- column 3: # of taxa missing
- column 4: fraction of occupancy
- column 5: names of missing taxa (; separated)
Aliases:
create_concatenation_matrix, create_concat, cc
Expand Down
8 changes: 5 additions & 3 deletions phykit/services/alignment/alignment_length.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
from .base import Alignment

from typing import Dict


class AlignmentLength(Alignment):
def __init__(self, args) -> None:
super().__init__(**self.process_args(args))

def run(self):
alignment, _ = self.get_alignment_and_format()
def run(self) -> None:
alignment, _, _ = self.get_alignment_and_format()
aln_len = alignment.get_alignment_length()
print(aln_len)

def process_args(self, args):
def process_args(self, args) -> Dict[str, str]:
return dict(alignment_file_path=args.alignment)
46 changes: 34 additions & 12 deletions phykit/services/alignment/alignment_length_no_gaps.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,62 @@
from argparse import Namespace
from typing import Dict, Tuple

from Bio.Align import MultipleSeqAlignment

from .base import Alignment


class AlignmentLengthNoGaps(Alignment):
def __init__(self, args) -> None:
super().__init__(**self.process_args(args))

def run(self):
alignment, alignment_format = self.get_alignment_and_format()
def run(self) -> None:
alignment, _, is_protein = self.get_alignment_and_format()
(
aln_len_no_gaps,
aln_len,
aln_len_no_gaps_per,
) = self.calculate_alignment_length_no_gaps(alignment)
) = self.calculate_alignment_length_no_gaps(alignment, is_protein)
print(f"{aln_len_no_gaps}\t{aln_len}\t{round(aln_len_no_gaps_per, 4)}")

def process_args(self, args):
def process_args(
self,
args: Namespace,
) -> Dict[str, str]:
return dict(alignment_file_path=args.alignment)

def calculate_alignment_length_no_gaps(self, alignment):
def calculate_alignment_length_no_gaps(
self,
alignment: MultipleSeqAlignment,
is_protein: bool,
) -> Tuple[int, int, float]:
aln_len = alignment.get_alignment_length()
aln_len_no_gaps = self.get_sites_no_gaps_count(alignment, aln_len)
aln_len_no_gaps = self.get_sites_no_gaps_count(
alignment,
aln_len,
is_protein
)

# calculate percent of variable sites
aln_len_no_gaps_per = (aln_len_no_gaps / aln_len) * 100

return aln_len_no_gaps, aln_len, aln_len_no_gaps_per

def get_sites_no_gaps_count(self, alignment, aln_len):
def get_sites_no_gaps_count(
self,
alignment: MultipleSeqAlignment,
aln_len: int,
is_protein: bool,
) -> int:
"""
Count sites in the alignment with no gaps
"""
aln_len_no_gaps = 0
for i in range(0, aln_len):
seq_at_position = ""
seq_at_position += alignment[:, i]
if "-" not in seq_at_position:

gap_chars = self.get_gap_chars()

for i in range(aln_len):
column = set(alignment[:, i])
if column.isdisjoint(gap_chars):
aln_len_no_gaps += 1

return aln_len_no_gaps
91 changes: 48 additions & 43 deletions phykit/services/alignment/alignment_recoding.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from os import path
import sys
from typing import Dict, List

from Bio.Align import MultipleSeqAlignment

from .base import Alignment

Expand All @@ -10,34 +13,42 @@ class AlignmentRecoding(Alignment):
def __init__(self, args) -> None:
super().__init__(**self.process_args(args))

def run(self):
alignment, _ = self.get_alignment_and_format()
def run(self) -> None:
alignment, _, is_protein = self.get_alignment_and_format()

recoding_table = self.read_recoding_table(self.code[0])

recoded_alignment = self.recode_alignment_as_dict(
alignment, recoding_table
recoded_alignment = self.recode_alignment(
alignment, recoding_table, is_protein
)

for k, v in recoded_alignment.items():
print(f">{k}\n{''.join(v)}")

def recode_alignment_as_dict(self, alignment, recoding_table: dict) -> dict:
def recode_alignment(
self,
alignment: MultipleSeqAlignment,
recoding_table: Dict[str, str],
is_protein: bool,
) -> Dict[str, List[str]]:

gap_chars = self.get_gap_chars()
recoded_alignment = dict()
for i in range(0, len(alignment)):
recoded_sequence_i = []
for j in range(alignment.get_alignment_length()):
sequence_ij = alignment[i, j].upper()
if sequence_ij in ["?", "-", "X"]:
recoded_sequence_i.append(sequence_ij)
else:
recoded_sequence_i.append(recoding_table[sequence_ij])

recoded_alignment[alignment[i].id] = recoded_sequence_i
for record in alignment:
recoded_sequence = [
recoding_table.get(base.upper(), base)
if base not in gap_chars else base
for base in record.seq
]
recoded_alignment[record.id] = recoded_sequence

return recoded_alignment

def read_recoding_table(self, recoding: str) -> dict:
def read_recoding_table(
self,
recoding: str
) -> Dict[str, str]:
"""
return translation table with codons as keys and amino acids as values
"""
Expand All @@ -47,33 +58,27 @@ def read_recoding_table(self, recoding: str) -> dict:
if recoding is None:
print("Please specify a recoding table")
sys.exit()
elif recoding == "RY-nucleotide":
pathing = path.join(here, "../../recoding_tables/RY-nucleotide.txt")
elif recoding == "SandR-6":
pathing = path.join(here, "../../recoding_tables/S_and_R-6.txt")
elif recoding == "KGB-6":
pathing = path.join(here, "../../recoding_tables/KGB-6.txt")
elif recoding == "Dayhoff-6":
pathing = path.join(here, "../../recoding_tables/Dayhoff-6.txt")
elif recoding == "Dayhoff-9":
pathing = path.join(here, "../../recoding_tables/Dayhoff-9.txt")
elif recoding == "Dayhoff-12":
pathing = path.join(here, "../../recoding_tables/Dayhoff-12.txt")
elif recoding == "Dayhoff-15":
pathing = path.join(here, "../../recoding_tables/Dayhoff-15.txt")
elif recoding == "Dayhoff-18":
pathing = path.join(here, "../../recoding_tables/Dayhoff-18.txt")
# handling case of a custom translation table
else:
pathing = str(recoding)

with open(pathing) as code:
for line in code:
line = line.split()
if line[1].upper() in recoding_table.keys():
recoding_table[line[1]].upper().append(line[0].upper())
else:
recoding_table[line[1]] = line[0].upper()

recoding_paths = {
"RY-nucleotide": "../../recoding_tables/RY-nucleotide.txt",
"SandR-6": "../../recoding_tables/S_and_R-6.txt",
"KGB-6": "../../recoding_tables/KGB-6.txt",
"Dayhoff-6": "../../recoding_tables/Dayhoff-6.txt",
"Dayhoff-9": "../../recoding_tables/Dayhoff-9.txt",
"Dayhoff-12": "../../recoding_tables/Dayhoff-12.txt",
"Dayhoff-15": "../../recoding_tables/Dayhoff-15.txt",
"Dayhoff-18": "../../recoding_tables/Dayhoff-18.txt",
}
pathing = recoding_paths.get(recoding, str(recoding))

try:
with open(path.join(here, pathing)) as code:
for line in code:
parts = line.split()
recoding_table[parts[1].upper()] = parts[0].upper()
except FileNotFoundError:
print(f"Recoding table file '{pathing}' not found.")
sys.exit()

return recoding_table

Expand Down
58 changes: 26 additions & 32 deletions phykit/services/alignment/base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from collections import Counter
import sys

from typing import List

from ..base import BaseService
from ...helpers.files import (
get_alignment_and_format as get_alignment_and_format_helper
Expand Down Expand Up @@ -49,47 +52,38 @@ def get_alignment_and_format(self):
print("Please double check pathing and filenames")
sys.exit()

def calculate_rcv(self):
alignment, _ = self.get_alignment_and_format()
def calculate_rcv(self) -> float:
alignment, _, _ = self.get_alignment_and_format()
aln_len = alignment.get_alignment_length()

# string to hold all sequences
concat_seq = ""
# initialize a counter for the number of sequences in the input fasta file
num_records = 0
concat_seq = []
num_records = len(alignment)

# for each record join concatSeq string and sequence as well as keeping track
# of the number of records
for record in alignment:
concat_seq += record.seq
num_records += 1
concat_seq = "".join(str(record.seq) for record in alignment)

# dictionary to hold the average occurence of each sequence letter
average_d = {}
# loop through the different sequences that appear in the fasta file
# population dictionary with how many times that sequence appears
for seq in set(concat_seq):
average_d[seq] = concat_seq.count(seq) / num_records
total_counts = Counter(concat_seq)

average_d = {
seq: total_counts[seq] / num_records for seq in total_counts
}

# intiailize list to hold the RCV values per ith taxa
# that will later be summed
indiv_rcv_values = []

# loop through records again and calculate RCV for
# each taxa and append to indivRCVvalues
for record in alignment:
# temp holds a temporary value of the numerator before appending
# to numeratorRCVvalues and then is reassigned to 0 when it goes
# through the loop again
temp = 0
# calculates the absolute value of the ith sequence letter minus the average
for seq_letter in set(concat_seq):
temp += abs(
record.seq.count(seq_letter) - average_d[seq_letter]
)
indiv_rcv_values.append(temp / (num_records * aln_len))
record_counts = Counter(record.seq)
temp_rcv = sum(
abs(
record_counts[seq_letter] - average_d[seq_letter]
) for seq_letter in total_counts
)
indiv_rcv_values.append(temp_rcv / (num_records * aln_len))

relative_composition_variability = sum(indiv_rcv_values)

# print the sum of all RCV values
return relative_composition_variability

def get_gap_chars(is_protein: bool) -> List[str]:
if is_protein:
return ["-", "?", "*", "X", "x"]
else:
return ["-", "?", "*", "X", "x", "N", "n"]
Loading

0 comments on commit c44037d

Please sign in to comment.