From f13447b910354dca3cc7c985875f04f9c921a7e1 Mon Sep 17 00:00:00 2001
From: JLSteenwyk <jlsteenwyk@gmail.com>
Date: Wed, 18 Sep 2024 17:09:37 -0700
Subject: [PATCH] added CPU support to variable sites function

---
 docs/usage/index.rst                          | 11 ++--
 phykit/phykit.py                              | 12 +++-
 .../alignment/parsimony_informative_sites.py  |  2 +-
 phykit/services/alignment/rcvt.py             | 30 ----------
 .../alignment/rename_fasta_entries.py         |  1 +
 .../services/alignment/sum_of_pairs_score.py  | 58 ++++++++++++++-----
 phykit/services/alignment/variable_sites.py   | 33 ++++++-----
 .../test_sum_of_pairs_score_integration.py    |  2 +-
 .../test_variable_sites_integration.py        |  7 +--
 .../services/alignment/test_variable_sites.py |  3 +-
 10 files changed, 87 insertions(+), 72 deletions(-)
diff --git a/docs/usage/index.rst b/docs/usage/index.rst
index 71d3fb9..e56a5ea 100644
--- a/docs/usage/index.rst
+++ b/docs/usage/index.rst
@@ -618,13 +618,13 @@ Acids Research (1999), doi: 10.1093/nar/27.13.2682.
 
 .. code-block:: shell
 
-	phykit sum_of_pairs_score <alignment> --reference <reference_alignment>
+	phykit sum_of_pairs_score <alignment> --reference <reference_alignment> [--cpu <cpu>]
 
 Options: |br|
 *<alignment>*: first argument after function name should be a query
 fasta alignment file to be scored for accuracy |br|
-*-r/\\-\\-reference*: reference alignment to compare the query alignment
-to
+*-r/\\-\\-reference*: reference alignment to compare the query alignment to |br|
+*\\-\\-cpu*: number of CPUs to use to accelerate the calculation
 
 |
 
@@ -649,10 +649,11 @@ doi: 10.1093/gbe/evw179.
 
 .. code-block:: shell
 
-   phykit variable_sites <alignment>
+   phykit variable_sites <alignment> [--cpu <cpu>]
 
 Options: |br|
-*<alignment>*: first argument after function name should be an alignment file
+*<alignment>*: first argument after function name should be an alignment file |br|
+*\\-\\-cpu*: number of CPUs to use to accelerate the calculation
 
 |
 
diff --git a/phykit/phykit.py b/phykit/phykit.py
index c12da21..b05b80d 100644
--- a/phykit/phykit.py
+++ b/phykit/phykit.py
@@ -1100,7 +1100,7 @@ def sum_of_pairs_score(argv):
                   pk_sum_of_pairs_score, pk_sops, pk_sop
 
                 Usage:
-                phykit sum_of_pairs_score <fasta> -r/--reference <ref.aln>
+                phykit sum_of_pairs_score <fasta> -r/--reference <ref.aln> [--cpu <cpu>]
 
                 Options
                 =====================================================
@@ -1111,11 +1111,15 @@ def sum_of_pairs_score(argv):
 
                 -r/--reference              reference fasta alignment to 
                                             compare query alignment to
+
+                --cpu                       CPUs to use to
+                                            accelerate calculation
                 """
             ),
         )
         parser.add_argument("fasta", type=str, help=SUPPRESS)
         parser.add_argument("-r", "--reference", type=str, help=SUPPRESS)
+        parser.add_argument("--cpu", type=int, help=SUPPRESS)
         args = parser.parse_args(argv)
         SumOfPairsScore(args).run()
 
@@ -1150,17 +1154,21 @@ def variable_sites(argv):
                   pk_variable_sites, pk_vs
 
                 Usage:
-                phykit variable_sites <alignment>
+                phykit variable_sites <alignment> [--cpu <cpu>]
 
                 Options
                 =====================================================
                 <alignment>                 first argument after 
                                             function name should be
                                             an alignment file          
+
+                --cpu                       CPUs to use to
+                                            accelerate calculation
                 """
             ),
         )
         parser.add_argument("alignment", type=str, help=SUPPRESS)
+        parser.add_argument("--cpu", type=int, help=SUPPRESS)
         args = parser.parse_args(argv)
         VariableSites(args).run()
 
diff --git a/phykit/services/alignment/parsimony_informative_sites.py b/phykit/services/alignment/parsimony_informative_sites.py
index eb288c4..2c42e53 100644
--- a/phykit/services/alignment/parsimony_informative_sites.py
+++ b/phykit/services/alignment/parsimony_informative_sites.py
@@ -19,7 +19,7 @@ def run(self):
         print(f"{pi_sites}\t{aln_len}\t{round(pi_sites_per, 4)}")
 
     def process_args(self, args) -> Dict[str, str]:
-        return dict(alignment_file_path=args.alignment)
+        return dict(alignment_file_path=args.alignment, cpu=args.cpu)  # NOTE(review): this patch adds --cpu only to the sum_of_pairs_score and variable_sites parsers; confirm this command's parser defines args.cpu
 
     def get_number_of_occurrences_per_character(
         self,
diff --git a/phykit/services/alignment/rcvt.py b/phykit/services/alignment/rcvt.py
index 9b8f26e..76da431 100644
--- a/phykit/services/alignment/rcvt.py
+++ b/phykit/services/alignment/rcvt.py
@@ -5,36 +5,6 @@
 from .base import Alignment
 
 
-# class RelativeCompositionVariabilityTaxon(Alignment):
-#     def __init__(self, args) -> None:
-#         super().__init__(**self.process_args(args))
-
-#     def run(self):
-#         alignment, _, _ = self.get_alignment_and_format()
-#         aln_len = alignment.get_alignment_length()
-#         num_records = len(alignment)
-
-#         concat_seq = "".join(str(record.seq) for record in alignment)
-#         total_counts = Counter(concat_seq)
-
-#         average_d = {
-#             char: total_counts[char] / num_records for char in total_counts
-#         }
-
-#         for record in alignment:
-#             record_counts = Counter(record.seq)
-#             temp_rcv = \
-#                 sum(
-#                     abs(
-#                         record_counts[seq_letter] - average_d[seq_letter]
-#                         ) for seq_letter in total_counts
-#                 )
-#             rcv_value = temp_rcv / (num_records * aln_len)
-#             print(f"{record.id}\t{round(rcv_value, 4)}")
-
-#     def process_args(self, args):
-#         return dict(alignment_file_path=args.alignment)
-
 class RelativeCompositionVariabilityTaxon(Alignment):
     def __init__(self, args) -> None:
         super().__init__(**self.process_args(args))
diff --git a/phykit/services/alignment/rename_fasta_entries.py b/phykit/services/alignment/rename_fasta_entries.py
index e5aa8e4..f39e9dc 100644
--- a/phykit/services/alignment/rename_fasta_entries.py
+++ b/phykit/services/alignment/rename_fasta_entries.py
@@ -30,6 +30,7 @@ def process_args(self, args) -> Dict[str, str]:
             fasta=args.fasta,
             idmap=args.idmap,
             output_file_path=output_file_path,
+            cpu=args.cpu
         )
 
     def load_idmap(self, idmap_file: str) -> Dict[str, str]:
diff --git a/phykit/services/alignment/sum_of_pairs_score.py b/phykit/services/alignment/sum_of_pairs_score.py
index 9b3e545..76fa15b 100644
--- a/phykit/services/alignment/sum_of_pairs_score.py
+++ b/phykit/services/alignment/sum_of_pairs_score.py
@@ -1,9 +1,9 @@
 import itertools
+from multiprocessing import Pool
 from typing import Dict, List, Tuple
 
 from Bio import SeqIO
 from Bio.SeqRecord import SeqRecord
-
 from .base import Alignment
 
 
@@ -27,7 +27,7 @@ def run(self):
         print(round(number_of_matches / number_of_total_pairs, 4))
 
     def process_args(self, args) -> Dict[str, str]:
-        return dict(fasta=args.fasta, reference=args.reference)
+        return dict(fasta=args.fasta, reference=args.reference, cpu=args.cpu)  # cpu is the int from --cpu, or None when the option is omitted
 
     def determine_number_of_matches_and_total_pairs(
         self,
@@ -35,21 +35,49 @@ def determine_number_of_matches_and_total_pairs(
         reference_records: Dict[str, SeqRecord],
         query_records: Dict[str, SeqRecord],
     ) -> Tuple[int, int]:
-        print(query_records)
+        cpu = self.set_cpu()
+        with Pool(cpu) as pool:
+            results = pool.starmap(
+                self.compare_pair,
+                [
+                    (
+                        first_in_pair,
+                        second_in_pair,
+                        reference_records,
+                        query_records,
+                    )
+                    for first_in_pair, second_in_pair in record_id_pairs
+                ]
+            )
+
+        number_of_matches = sum(result[0] for result in results)
+        number_of_total_pairs = sum(result[1] for result in results)
+
+        return number_of_matches, number_of_total_pairs
+
+    def compare_pair(
+        self,
+        first_in_pair: str,
+        second_in_pair: str,
+        reference_records: Dict[str, SeqRecord],
+        query_records: Dict[str, SeqRecord],
+    ) -> Tuple[int, int]:
+        """
+        Return (matches, total) over the zipped columns of one record-id pair.
+        """
         number_of_matches = 0
         number_of_total_pairs = 0
 
-        for first_in_pair, second_in_pair in record_id_pairs:
-            ref_seq1 = reference_records[first_in_pair].seq
-            ref_seq2 = reference_records[second_in_pair].seq
-            query_seq1 = query_records[first_in_pair].seq
-            query_seq2 = query_records[second_in_pair].seq
-
-            for ref_res1, ref_res2, query_res1, query_res2 in zip(
-                ref_seq1, ref_seq2, query_seq1, query_seq2
-            ):
-                number_of_total_pairs += 1
-                if ref_res1 == query_res1 and ref_res2 == query_res2:
-                    number_of_matches += 1
+        ref_seq1 = reference_records[first_in_pair].seq
+        ref_seq2 = reference_records[second_in_pair].seq
+        query_seq1 = query_records[first_in_pair].seq
+        query_seq2 = query_records[second_in_pair].seq
+
+        for ref_res1, ref_res2, query_res1, query_res2 in zip(
+            ref_seq1, ref_seq2, query_seq1, query_seq2
+        ):  # zip stops at the shortest of the four sequences
+            number_of_total_pairs += 1
+            if ref_res1 == query_res1 and ref_res2 == query_res2:  # both residues must agree with the reference
+                number_of_matches += 1
 
         return number_of_matches, number_of_total_pairs
diff --git a/phykit/services/alignment/variable_sites.py b/phykit/services/alignment/variable_sites.py
index 6ac3a16..41a4d85 100644
--- a/phykit/services/alignment/variable_sites.py
+++ b/phykit/services/alignment/variable_sites.py
@@ -1,3 +1,4 @@
+from multiprocessing import Pool
 from typing import Dict, Tuple
 
 from Bio.Align import MultipleSeqAlignment
@@ -10,35 +11,41 @@ def __init__(self, args) -> None:
         super().__init__(**self.process_args(args))
 
     def run(self):
-        alignment, _, is_protein = self.get_alignment_and_format()
+        alignment, _, _ = self.get_alignment_and_format()  # only the alignment is used; the other two return values are discarded
         var_sites, aln_len, var_sites_per = \
             self.calculate_variable_sites(alignment)
 
         print(f"{var_sites}\t{aln_len}\t{round(var_sites_per, 4)}")
 
     def process_args(self, args) -> Dict[str, str]:
-        return dict(alignment_file_path=args.alignment)
+        return dict(alignment_file_path=args.alignment, cpu=args.cpu)  # cpu is the int from --cpu, or None when the option is omitted
 
     def calculate_variable_sites(
         self,
         alignment: MultipleSeqAlignment
     ) -> Tuple[int, int, float]:
         aln_len = alignment.get_alignment_length()
-
         gap_chars = self.get_gap_chars()
 
-        var_sites = 0
-
-        for i in range(aln_len):
-            seq_at_position = [
-                residue.upper()
-                for residue in alignment[:, i]
-                if residue not in gap_chars
-            ]
+        cpu = self.set_cpu()  # presumably resolves the --cpu option to a worker count; set_cpu is defined outside this view — confirm
 
-            if len(set(seq_at_position)) > 1:
-                var_sites += 1
+        with Pool(cpu) as pool:
+            results = pool.map(
+                self.check_site_variability,
+                [(alignment[:, i], gap_chars) for i in range(aln_len)]  # one (column, gap_chars) task per alignment position
+            )
 
+        var_sites = sum(results)  # each result is 1 for a variable site and 0 otherwise
         var_sites_per = (var_sites / aln_len) * 100
 
         return var_sites, aln_len, var_sites_per
+
+    def check_site_variability(self, args: Tuple[str, set]) -> int:
+        seq_at_position, gap_chars = args  # packed as one tuple because Pool.map passes a single item per call
+        seq_at_position = [
+            residue.upper()  # upper-case so the comparison ignores case
+            for residue in seq_at_position
+            if residue not in gap_chars
+        ]
+
+        return 1 if len(set(seq_at_position)) > 1 else 0  # variable when more than one distinct non-gap residue remains
diff --git a/tests/integration/alignment/test_sum_of_pairs_score_integration.py b/tests/integration/alignment/test_sum_of_pairs_score_integration.py
index 98f823e..5fc83a4 100644
--- a/tests/integration/alignment/test_sum_of_pairs_score_integration.py
+++ b/tests/integration/alignment/test_sum_of_pairs_score_integration.py
@@ -12,7 +12,7 @@
 class TestSumOfPairsScore(object):
     @patch("builtins.print")
     def test_sum_of_pairs_score_full_ref(self, mocked_print):
-        expected_result = 0.7714
+        expected_result = 0.4
         testargs = [
             "phykit",
             "sum_of_pairs_score",
diff --git a/tests/integration/alignment/test_variable_sites_integration.py b/tests/integration/alignment/test_variable_sites_integration.py
index e595dc7..6ff9d9c 100644
--- a/tests/integration/alignment/test_variable_sites_integration.py
+++ b/tests/integration/alignment/test_variable_sites_integration.py
@@ -1,8 +1,7 @@
-import pytest
-import sys
 from mock import patch, call
 from pathlib import Path
-from textwrap import dedent
+import pytest
+import sys
 
 from phykit.phykit import Phykit
 
@@ -70,4 +69,4 @@ def test_variable_sites_incorrect_input_file(self, mocked_print):
             Phykit()
 
         assert pytest_wrapped_e.type == SystemExit
-        assert pytest_wrapped_e.value.code == 2
\ No newline at end of file
+        assert pytest_wrapped_e.value.code == 2
diff --git a/tests/unit/services/alignment/test_variable_sites.py b/tests/unit/services/alignment/test_variable_sites.py
index ab10bb8..9d291d0 100644
--- a/tests/unit/services/alignment/test_variable_sites.py
+++ b/tests/unit/services/alignment/test_variable_sites.py
@@ -7,7 +7,7 @@
 
 @pytest.fixture
 def args():
-    kwargs = dict(alignment="/some/path/to/file.fa")
+    kwargs = dict(alignment="/some/path/to/file.fa", cpu=1)  # cpu is now read by VariableSites.process_args
     return Namespace(**kwargs)
 
 
@@ -15,6 +15,7 @@ class TestVariableSites(object):
     def test_init_sets_alignment_file_path(self, args):
         vs = VariableSites(args)
         assert vs.alignment_file_path == args.alignment
+        assert vs.cpu == (1,)  # NOTE(review): fixture passes cpu=1 but a 1-tuple is expected — presumably a stray trailing comma where the base class stores cpu; confirm
         assert vs.output_file_path is None
 
     def test_variable_sites(self, alignment_simple, args):