Skip to content

Commit

Permalink
reorganized and streamlined rcv and rcvt calculations
Browse files Browse the repository at this point in the history
  • Loading branch information
JLSteenwyk committed Sep 12, 2024
1 parent 50e3f0c commit efb57b2
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 76 deletions.
12 changes: 9 additions & 3 deletions phykit/helpers/files.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from enum import Enum
import sys
from typing import Tuple

from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
Expand All @@ -16,11 +17,16 @@ class FileFormat(Enum):
stockholm = "stockholm"


def get_alignment_and_format(alignment_file_path: str):
# if file format is provided, read the file according to the user's file format
def get_alignment_and_format(
alignment_file_path: str
) -> Tuple[MultipleSeqAlignment, str, bool]:
# if file format is provided, read the file
# according to the user's file format
for fileFormat in FileFormat:
try:
alignment = AlignIO.read(open(alignment_file_path), fileFormat.value)
alignment = AlignIO.read(
open(alignment_file_path), fileFormat.value
)
return alignment, fileFormat.value, is_protein_alignment(alignment)
# the following exceptions refer to skipping over errors
# associated with reading the wrong input file
Expand Down
50 changes: 18 additions & 32 deletions phykit/services/alignment/base.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from collections import Counter
import sys

from typing import List
Expand Down Expand Up @@ -51,49 +52,34 @@ def get_alignment_and_format(self):
print("Please double check pathing and filenames")
sys.exit()

def calculate_rcv(self):
alignment, _ = self.get_alignment_and_format()
def calculate_rcv(self) -> float:
alignment, _, _ = self.get_alignment_and_format()
aln_len = alignment.get_alignment_length()

# string to hold all sequences
concat_seq = ""
# initialize a counter for the number of sequences in the input fasta file
num_records = 0
concat_seq = []
num_records = len(alignment)

# for each record join concatSeq string and sequence as well as keeping track
# of the number of records
for record in alignment:
concat_seq += record.seq
num_records += 1
concat_seq = "".join(str(record.seq) for record in alignment)

total_counts = Counter(concat_seq)

# dictionary to hold the average occurence of each sequence letter
average_d = {}
# loop through the different sequences that appear in the fasta file
# population dictionary with how many times that sequence appears
for seq in set(concat_seq):
average_d[seq] = concat_seq.count(seq) / num_records
average_d = {
seq: total_counts[seq] / num_records for seq in total_counts
}

# intiailize list to hold the RCV values per ith taxa
# that will later be summed
indiv_rcv_values = []

# loop through records again and calculate RCV for
# each taxa and append to indivRCVvalues
for record in alignment:
# temp holds a temporary value of the numerator before appending
# to numeratorRCVvalues and then is reassigned to 0 when it goes
# through the loop again
temp = 0
# calculates the absolute value of the ith sequence letter minus the average
for seq_letter in set(concat_seq):
temp += abs(
record.seq.count(seq_letter) - average_d[seq_letter]
)
indiv_rcv_values.append(temp / (num_records * aln_len))
record_counts = Counter(record.seq)
temp_rcv = sum(
abs(
record_counts[seq_letter] - average_d[seq_letter]
) for seq_letter in total_counts
)
indiv_rcv_values.append(temp_rcv / (num_records * aln_len))

relative_composition_variability = sum(indiv_rcv_values)

# print the sum of all RCV values
return relative_composition_variability

def get_gap_chars(is_protein: bool) -> List[str]:
Expand Down
57 changes: 18 additions & 39 deletions phykit/services/alignment/rcvt.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from collections import Counter

from .base import Alignment


Expand All @@ -6,50 +8,27 @@ def __init__(self, args) -> None:
super().__init__(**self.process_args(args))

def run(self):
alignment, _ = self.get_alignment_and_format()
alignment, _, _ = self.get_alignment_and_format()
aln_len = alignment.get_alignment_length()
num_records = len(alignment)

# string to hold all sequences
concat_seq = ""
# initialize a counter for the number of sequences in the input fasta file
num_records = 0
concat_seq = "".join(str(record.seq) for record in alignment)
total_counts = Counter(concat_seq)

average_d = {
char: total_counts[char] / num_records for char in total_counts
}

# for each record join concatSeq string and sequence as well as keeping track
# of the number of records
for record in alignment:
concat_seq += record.seq
num_records += 1

# dictionary to hold the average occurence of each sequence letter
average_d = {}
# loop through the different sequences that appear in the fasta file
# population dictionary with how many times that sequence appears
for seq in set(concat_seq):
average_d[seq] = concat_seq.count(seq) / num_records

# intiailize list to hold the RCV values per ith taxa
# that will later be summed
indiv_rcv_values = []

# loop through records again and calculate RCV for
# each taxa and append to indivRCVvalues
for record in alignment:
# temp holds a temporary value of the numerator before appending
# to numeratorRCVvalues and then is reassigned to 0 when it goes
# through the loop again
temp = 0
# calculates the absolute value of the ith sequence letter minus the average
for seq_letter in set(concat_seq):
temp += abs(
record.seq.count(seq_letter) - average_d[seq_letter]
record_counts = Counter(record.seq)
temp_rcv = \
sum(
abs(
record_counts[seq_letter] - average_d[seq_letter]
) for seq_letter in total_counts
)
indiv_rcv_values.append(temp / (num_records * aln_len))
print(f"{record.id}\t{round((temp / (num_records * aln_len)),4)}")

# relative_composition_variability = sum(indiv_rcv_values)

# print the sum of all RCV values
# print(round(relative_composition_variability, 4))
rcv_value = temp_rcv / (num_records * aln_len)
print(f"{record.id}\t{round(rcv_value, 4)}")

def process_args(self, args):
return dict(alignment_file_path=args.alignment)
3 changes: 1 addition & 2 deletions tests/integration/alignment/test_rcv_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import sys
from mock import patch, call
from pathlib import Path
from textwrap import dedent

from phykit.phykit import Phykit

Expand Down Expand Up @@ -70,4 +69,4 @@ def test_rcv_incorrect_input_file(self, mocked_print):
Phykit()

assert pytest_wrapped_e.type == SystemExit
assert pytest_wrapped_e.value.code == 2
assert pytest_wrapped_e.value.code == 2

0 comments on commit efb57b2

Please sign in to comment.