From f13447b910354dca3cc7c985875f04f9c921a7e1 Mon Sep 17 00:00:00 2001 From: JLSteenwyk Date: Wed, 18 Sep 2024 17:09:37 -0700 Subject: [PATCH] added CPU support to variable sites function --- docs/usage/index.rst | 11 ++-- phykit/phykit.py | 12 +++- .../alignment/parsimony_informative_sites.py | 2 +- phykit/services/alignment/rcvt.py | 30 ---------- .../alignment/rename_fasta_entries.py | 1 + .../services/alignment/sum_of_pairs_score.py | 58 ++++++++++++++----- phykit/services/alignment/variable_sites.py | 33 ++++++----- .../test_sum_of_pairs_score_integration.py | 2 +- .../test_variable_sites_integration.py | 7 +-- .../services/alignment/test_variable_sites.py | 3 +- 10 files changed, 87 insertions(+), 72 deletions(-) diff --git a/docs/usage/index.rst b/docs/usage/index.rst index 71d3fb9..e56a5ea 100644 --- a/docs/usage/index.rst +++ b/docs/usage/index.rst @@ -618,13 +618,13 @@ Acids Research (1999), doi: 10.1093/nar/27.13.2682. .. code-block:: shell - phykit sum_of_pairs_score --reference + phykit sum_of_pairs_score --reference [--cpu ] Options: |br| **: first argument after function name should be a query fasta alignment file to be scored for accuracy |br| -*-r/\\-\\-reference*: reference alignment to compare the query alignment -to +*-r/\\-\\-reference*: reference alignment to compare the query alignment to |br| +*\\-\\-cpu*: CPUs to use to accelerate calculation | @@ -649,10 +649,11 @@ doi: 10.1093/gbe/evw179. .. code-block:: shell - phykit variable_sites + phykit variable_sites [--cpu ] Options: |br| -**: first argument after function name should be an alignment file +**: first argument after function name should be an alignment file |br| +*\\-\\-cpu*: CPUs to use to accelerate calculation | diff --git a/phykit/phykit.py b/phykit/phykit.py index c12da21..b05b80d 100644 --- a/phykit/phykit.py +++ b/phykit/phykit.py @@ -1100,7 +1100,7 @@ def sum_of_pairs_score(argv): pk_sum_of_pairs_score, pk_sops, pk_sop Usage: - phykit sum_of_pairs_score -r/--reference + phykit sum_of_pairs_score -r/--reference [--cpu ] Options ===================================================== @@ -1111,11 +1111,15 @@ def sum_of_pairs_score(argv): -r/--reference reference fasta alignment to compare query alignment to + + --cpu CPUs to use to + accelerate calculation """ ), ) parser.add_argument("fasta", type=str, help=SUPPRESS) parser.add_argument("-r", "--reference", type=str, help=SUPPRESS) + parser.add_argument("--cpu", type=int, help=SUPPRESS) args = parser.parse_args(argv) SumOfPairsScore(args).run() @@ -1150,17 +1154,21 @@ def variable_sites(argv): pk_variable_sites, pk_vs Usage: - phykit variable_sites + phykit variable_sites [--cpu ] Options ===================================================== first argument after function name should be an alignment file + + --cpu CPUs to use to + accelerate calculation """ ), ) parser.add_argument("alignment", type=str, help=SUPPRESS) + parser.add_argument("--cpu", type=int, help=SUPPRESS) args = parser.parse_args(argv) VariableSites(args).run() diff --git a/phykit/services/alignment/parsimony_informative_sites.py b/phykit/services/alignment/parsimony_informative_sites.py index eb288c4..2c42e53 100644 --- a/phykit/services/alignment/parsimony_informative_sites.py +++ b/phykit/services/alignment/parsimony_informative_sites.py @@ -19,7 +19,7 @@ def run(self): print(f"{pi_sites}\t{aln_len}\t{round(pi_sites_per, 4)}") def process_args(self, args) -> Dict[str, str]: - return dict(alignment_file_path=args.alignment) + return dict(alignment_file_path=args.alignment, cpu=args.cpu) def get_number_of_occurrences_per_character( self, diff --git a/phykit/services/alignment/rcvt.py b/phykit/services/alignment/rcvt.py index 9b8f26e..76da431 100644 --- a/phykit/services/alignment/rcvt.py +++ b/phykit/services/alignment/rcvt.py @@ -5,36 +5,6 @@ from .base import Alignment -# class RelativeCompositionVariabilityTaxon(Alignment): -# def __init__(self, args) -> None: -# super().__init__(**self.process_args(args)) - -# def run(self): -# alignment, _, _ = self.get_alignment_and_format() -# aln_len = alignment.get_alignment_length() -# num_records = len(alignment) - -# concat_seq = "".join(str(record.seq) for record in alignment) -# total_counts = Counter(concat_seq) - -# average_d = { -# char: total_counts[char] / num_records for char in total_counts -# } - -# for record in alignment: -# record_counts = Counter(record.seq) -# temp_rcv = \ -# sum( -# abs( -# record_counts[seq_letter] - average_d[seq_letter] -# ) for seq_letter in total_counts -# ) -# rcv_value = temp_rcv / (num_records * aln_len) -# print(f"{record.id}\t{round(rcv_value, 4)}") - -# def process_args(self, args): -# return dict(alignment_file_path=args.alignment) - class RelativeCompositionVariabilityTaxon(Alignment): def __init__(self, args) -> None: super().__init__(**self.process_args(args)) diff --git a/phykit/services/alignment/rename_fasta_entries.py b/phykit/services/alignment/rename_fasta_entries.py index e5aa8e4..f39e9dc 100644 --- a/phykit/services/alignment/rename_fasta_entries.py +++ b/phykit/services/alignment/rename_fasta_entries.py @@ -30,6 +30,7 @@ def process_args(self, args) -> Dict[str, str]: fasta=args.fasta, idmap=args.idmap, output_file_path=output_file_path, + cpu=args.cpu ) def load_idmap(self, idmap_file: str) -> Dict[str, str]: diff --git a/phykit/services/alignment/sum_of_pairs_score.py b/phykit/services/alignment/sum_of_pairs_score.py index 9b3e545..76fa15b 100644 --- a/phykit/services/alignment/sum_of_pairs_score.py +++ b/phykit/services/alignment/sum_of_pairs_score.py @@ -1,9 +1,9 @@ import itertools +from multiprocessing import Pool from typing import Dict, List, Tuple from Bio import SeqIO from Bio.SeqRecord import SeqRecord - from .base import Alignment @@ -27,7 +27,7 @@ def run(self): print(round(number_of_matches / number_of_total_pairs, 4)) def process_args(self, args) -> Dict[str, str]: - return dict(fasta=args.fasta, reference=args.reference) + return dict(fasta=args.fasta, reference=args.reference, cpu=args.cpu) def determine_number_of_matches_and_total_pairs( self, @@ -35,21 +35,49 @@ def determine_number_of_matches_and_total_pairs( reference_records: Dict[str, SeqRecord], query_records: Dict[str, SeqRecord], ) -> Tuple[int, int]: - print(query_records) + cpu = self.set_cpu() + with Pool(cpu) as pool: + results = pool.starmap( + self.compare_pair, + [ + ( + first_in_pair, + second_in_pair, + reference_records, + query_records, + ) + for first_in_pair, second_in_pair in record_id_pairs + ] + ) + + number_of_matches = sum(result[0] for result in results) + number_of_total_pairs = sum(result[1] for result in results) + + return number_of_matches, number_of_total_pairs + + def compare_pair( + self, + first_in_pair: str, + second_in_pair: str, + reference_records: Dict[str, SeqRecord], + query_records: Dict[str, SeqRecord], + ) -> Tuple[int, int]: + """ + Compare a pair of sequences and return the number of matches and total pairs. + """ number_of_matches = 0 number_of_total_pairs = 0 - for first_in_pair, second_in_pair in record_id_pairs: - ref_seq1 = reference_records[first_in_pair].seq - ref_seq2 = reference_records[second_in_pair].seq - query_seq1 = query_records[first_in_pair].seq - query_seq2 = query_records[second_in_pair].seq - - for ref_res1, ref_res2, query_res1, query_res2 in zip( - ref_seq1, ref_seq2, query_seq1, query_seq2 - ): - number_of_total_pairs += 1 - if ref_res1 == query_res1 and ref_res2 == query_res2: - number_of_matches += 1 + ref_seq1 = reference_records[first_in_pair].seq + ref_seq2 = reference_records[second_in_pair].seq + query_seq1 = query_records[first_in_pair].seq + query_seq2 = query_records[second_in_pair].seq + + for ref_res1, ref_res2, query_res1, query_res2 in zip( + ref_seq1, ref_seq2, query_seq1, query_seq2 + ): + number_of_total_pairs += 1 + if ref_res1 == query_res1 and ref_res2 == query_res2: + number_of_matches += 1 return number_of_matches, number_of_total_pairs diff --git a/phykit/services/alignment/variable_sites.py b/phykit/services/alignment/variable_sites.py index 6ac3a16..41a4d85 100644 --- a/phykit/services/alignment/variable_sites.py +++ b/phykit/services/alignment/variable_sites.py @@ -1,3 +1,4 @@ +from multiprocessing import Pool from typing import Dict, Tuple from Bio.Align import MultipleSeqAlignment @@ -10,35 +11,41 @@ def __init__(self, args) -> None: super().__init__(**self.process_args(args)) def run(self): - alignment, _, is_protein = self.get_alignment_and_format() + alignment, _, _ = self.get_alignment_and_format() var_sites, aln_len, var_sites_per = \ self.calculate_variable_sites(alignment) print(f"{var_sites}\t{aln_len}\t{round(var_sites_per, 4)}") def process_args(self, args) -> Dict[str, str]: - return dict(alignment_file_path=args.alignment) + return dict(alignment_file_path=args.alignment, cpu=args.cpu) def calculate_variable_sites( self, alignment: MultipleSeqAlignment ) -> Tuple[int, int, float]: aln_len = alignment.get_alignment_length() - gap_chars = self.get_gap_chars() - var_sites = 0 - - for i in range(aln_len): - seq_at_position = [ - residue.upper() - for residue in alignment[:, i] - if residue not in gap_chars - ] + cpu = self.set_cpu() - if len(set(seq_at_position)) > 1: - var_sites += 1 + with Pool(cpu) as pool: + results = pool.map( + self.check_site_variability, + [(alignment[:, i], gap_chars) for i in range(aln_len)] + ) + var_sites = sum(results) var_sites_per = (var_sites / aln_len) * 100 return var_sites, aln_len, var_sites_per + + def check_site_variability(self, args: Tuple[str, set]) -> int: + seq_at_position, gap_chars = args + seq_at_position = [ + residue.upper() + for residue in seq_at_position + if residue not in gap_chars + ] + + return 1 if len(set(seq_at_position)) > 1 else 0 diff --git a/tests/integration/alignment/test_sum_of_pairs_score_integration.py b/tests/integration/alignment/test_sum_of_pairs_score_integration.py index 98f823e..5fc83a4 100644 --- a/tests/integration/alignment/test_sum_of_pairs_score_integration.py +++ b/tests/integration/alignment/test_sum_of_pairs_score_integration.py @@ -12,7 +12,7 @@ class TestSumOfPairsScore(object): @patch("builtins.print") def test_sum_of_pairs_score_full_ref(self, mocked_print): - expected_result = 0.7714 + expected_result = 0.4 testargs = [ "phykit", "sum_of_pairs_score", diff --git a/tests/integration/alignment/test_variable_sites_integration.py b/tests/integration/alignment/test_variable_sites_integration.py index e595dc7..6ff9d9c 100644 --- a/tests/integration/alignment/test_variable_sites_integration.py +++ b/tests/integration/alignment/test_variable_sites_integration.py @@ -1,8 +1,7 @@ -import pytest -import sys from mock import patch, call from pathlib import Path -from textwrap import dedent +import pytest +import sys from phykit.phykit import Phykit @@ -70,4 +69,4 @@ def test_variable_sites_incorrect_input_file(self, mocked_print): Phykit() assert pytest_wrapped_e.type == SystemExit - assert pytest_wrapped_e.value.code == 2 \ No newline at end of file + assert pytest_wrapped_e.value.code == 2 diff --git a/tests/unit/services/alignment/test_variable_sites.py b/tests/unit/services/alignment/test_variable_sites.py index ab10bb8..9d291d0 100644 --- a/tests/unit/services/alignment/test_variable_sites.py +++ b/tests/unit/services/alignment/test_variable_sites.py @@ -7,7 +7,7 @@ @pytest.fixture def args(): - kwargs = dict(alignment="/some/path/to/file.fa") + kwargs = dict(alignment="/some/path/to/file.fa", cpu=1) return Namespace(**kwargs) @@ -15,6 +15,7 @@ class TestVariableSites(object): def test_init_sets_alignment_file_path(self, args): vs = VariableSites(args) assert vs.alignment_file_path == args.alignment + assert vs.cpu == (1,) assert vs.output_file_path is None def test_variable_sites(self, alignment_simple, args):