From 5cd45ac3e53701728f80f0c8928af52fb3e58285 Mon Sep 17 00:00:00 2001 From: JLSteenwyk Date: Tue, 24 Oct 2023 16:00:09 -0700 Subject: [PATCH 1/2] updated versions of biopython, numpy, and scipy --- phykit/helpers/stats_summary.py | 1 - requirements.txt | 8 ++++---- setup.py | 6 +++--- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/phykit/helpers/stats_summary.py b/phykit/helpers/stats_summary.py index 1d157cb..64a79cb 100644 --- a/phykit/helpers/stats_summary.py +++ b/phykit/helpers/stats_summary.py @@ -1,5 +1,4 @@ import statistics as stat -import sys import numpy as np diff --git a/requirements.txt b/requirements.txt index f719525..0c2c5ea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -biopython>=1.76 -numpy>=1.18.2 -scipy>=1.4.1 -cython \ No newline at end of file +biopython>=1.81 +numpy>=1.24.0 +scipy>=1.11.3 +cython diff --git a/setup.py b/setup.py index 72bde12..a817cbe 100644 --- a/setup.py +++ b/setup.py @@ -18,9 +18,9 @@ ] REQUIRES = [ - "biopython>=1.79", - "numpy>=1.18.2", - "scipy>=1.4.1", + "biopython>=1.81", + "numpy>=1.24.0", + "scipy>=1.11.3", "cython" ] From e9c15952322a86f8f021416296a411a15dbf3b1e Mon Sep 17 00:00:00 2001 From: JLSteenwyk Date: Tue, 24 Oct 2023 16:55:28 -0700 Subject: [PATCH 2/2] updated and cleaned codebase. minor bump to reflect these changes --- phykit/helpers/boolean_argument_parsing.py | 9 +- phykit/helpers/files.py | 4 +- phykit/helpers/stats_summary.py | 35 +- phykit/phykit.py | 393 ++++++++++-------- phykit/services/alignment/alignment_length.py | 3 +- .../alignment/alignment_length_no_gaps.py | 13 +- phykit/services/alignment/base.py | 28 +- phykit/services/alignment/column_score.py | 51 +-- .../alignment/create_concatenation_matrix.py | 164 +++----- phykit/services/alignment/dna_threader.py | 94 ++--- phykit/services/alignment/faidx.py | 1 + phykit/services/alignment/gc_content.py | 26 +- .../services/alignment/pairwise_identity.py | 24 +- .../alignment/parsimony_informative_sites.py | 42 +- phykit/services/alignment/rcv.py | 7 +- .../alignment/rename_fasta_entries.py | 19 +- .../services/alignment/sum_of_pairs_score.py | 59 ++- phykit/services/alignment/variable_sites.py | 12 +- phykit/services/tree/__init__.py | 2 +- phykit/services/tree/base.py | 45 +- .../tree/bipartition_support_stats.py | 22 +- .../services/tree/branch_length_multiplier.py | 14 +- phykit/services/tree/collapse_branches.py | 14 +- .../tree/covarying_evolutionary_rates.py | 70 ++-- phykit/services/tree/dvmc.py | 41 +- phykit/services/tree/evolutionary_rate.py | 3 +- phykit/services/tree/hidden_paralogy_check.py | 38 +- phykit/services/tree/internal_branch_stats.py | 27 +- .../tree/last_common_ancestor_subtree.py | 2 - phykit/services/tree/lb_score.py | 44 +- phykit/services/tree/monophyly_check.py | 34 +- .../tree/nearest_neighbor_interchange.py | 20 +- phykit/services/tree/patristic_distances.py | 22 +- phykit/services/tree/polytomy_test.py | 207 ++++----- phykit/services/tree/print_tree.py | 9 +- phykit/services/tree/prune_tree.py | 15 +- phykit/services/tree/rename_tree_tips.py | 19 +- phykit/services/tree/rf_distance.py | 46 +- phykit/services/tree/root_tree.py | 13 +- phykit/services/tree/saturation.py | 59 ++- phykit/services/tree/spurious_sequence.py | 34 +- phykit/services/tree/terminal_branch_stats.py | 25 +- phykit/services/tree/tip_labels.py | 9 +- phykit/services/tree/tip_to_tip_distance.py | 6 +- .../services/tree/tip_to_tip_node_distance.py | 6 +- phykit/services/tree/total_tree_length.py | 1 - phykit/services/tree/treeness.py | 1 - phykit/services/tree/treeness_over_rcv.py | 22 +- phykit/version.py | 2 +- 49 files changed, 829 insertions(+), 1027 deletions(-) diff --git a/phykit/helpers/boolean_argument_parsing.py b/phykit/helpers/boolean_argument_parsing.py index 1e448be..aa56b65 100644 --- a/phykit/helpers/boolean_argument_parsing.py +++ b/phykit/helpers/boolean_argument_parsing.py @@ -1,11 +1,12 @@ import argparse + def str2bool(v): if isinstance(v, bool): - return v - if v.lower() in ('true', 't', '1'): + return v + if v.lower() in ("true", "t", "1"): return True - elif v.lower() in ('false', 'f', '0'): + elif v.lower() in ("false", "f", "0"): return False else: - raise argparse.ArgumentTypeError('Boolean value expected.') \ No newline at end of file + raise argparse.ArgumentTypeError("Boolean value expected.") diff --git a/phykit/helpers/files.py b/phykit/helpers/files.py index cfd9a79..2391c9e 100644 --- a/phykit/helpers/files.py +++ b/phykit/helpers/files.py @@ -32,12 +32,12 @@ def get_alignment_and_format(alignment_file_path: str): print("Please check file name and pathing") sys.exit() + def read_single_column_file_to_list(single_col_file_path: str) -> list: try: with open(single_col_file_path) as f: - return [line.rstrip('\n').strip() for line in f] + return [line.rstrip("\n").strip() for line in f] except FileNotFoundError: print(f"{single_col_file_path} corresponds to no such file or directory.") print("Please check file name and pathing") sys.exit() - diff --git a/phykit/helpers/stats_summary.py b/phykit/helpers/stats_summary.py index 64a79cb..23d7969 100644 --- a/phykit/helpers/stats_summary.py +++ b/phykit/helpers/stats_summary.py @@ -3,28 +3,25 @@ import numpy as np -def calculate_summary_statistics_from_arr( - arr -): +def calculate_summary_statistics_from_arr(arr): """ calcuate summary statistics for an input list """ stats = dict( - mean = stat.mean(arr), - median = stat.median(arr), - twenty_fifth = np.percentile(arr, 25), - seventy_fifth = np.percentile(arr, 75), - minimum = np.min(arr), - maximum = np.max(arr), - standard_deviation = stat.stdev(arr), - variance = stat.variance(arr) + mean=stat.mean(arr), + median=stat.median(arr), + twenty_fifth=np.percentile(arr, 25), + seventy_fifth=np.percentile(arr, 75), + minimum=np.min(arr), + maximum=np.max(arr), + standard_deviation=stat.stdev(arr), + variance=stat.variance(arr), ) return stats -def calculate_summary_statistics_from_dict( - dat: dict -): + +def calculate_summary_statistics_from_dict(dat: dict): """ calcuate summary statistics for a dictionary """ @@ -36,16 +33,14 @@ def calculate_summary_statistics_from_dict( minimum=np.min([*dat.values()]), maximum=np.max([*dat.values()]), standard_deviation=stat.stdev([*dat.values()]), - variance=stat.variance([*dat.values()]) + variance=stat.variance([*dat.values()]), ) return stats -def print_summary_statistics( - stats: list -): - """ - """ + +def print_summary_statistics(stats: list): + """ """ try: print(f"mean: {round(stats['mean'], 4)}") print(f"median: {round(stats['median'], 4)}") diff --git a/phykit/phykit.py b/phykit/phykit.py index 057bfa2..15e0bac 100644 --- a/phykit/phykit.py +++ b/phykit/phykit.py @@ -26,7 +26,7 @@ RelativeCompositionVariability, RenameFastaEntries, SumOfPairsScore, - VariableSites + VariableSites, ) from .services.tree import ( @@ -58,7 +58,7 @@ TipToTipNodeDistance, TotalTreeLength, Treeness, - TreenessOverRCV + TreenessOverRCV, ) from .helpers.boolean_argument_parsing import str2bool @@ -86,6 +86,7 @@ """ + class Phykit(object): help_header = f""" _____ _ _ _______ _______ @@ -103,7 +104,7 @@ class Phykit(object): Publication link: https://academic.oup.com/bioinformatics/article-abstract/37/16/2325/6131675 """ - + def __init__(self): parser = ArgumentParser( add_help=True, @@ -255,113 +256,118 @@ def __init__(self): ## Aliases def run_alias(self, command, argv): # version - if command in ['version', 'v']: + if command in ["version", "v"]: return self.version() # Alignment aliases - if command in ['aln_len', 'al']: + if command in ["aln_len", "al"]: return self.alignment_length(argv) - elif command in ['aln_len_no_gaps', 'alng']: + elif command in ["aln_len_no_gaps", "alng"]: return self.alignment_length_no_gaps(argv) - elif command in 'cs': + elif command in "cs": return self.column_score(argv) - elif command in ['get_entry', 'ge']: + elif command in ["get_entry", "ge"]: return self.faidx(argv) - elif command == 'gc': + elif command == "gc": return self.gc_content(argv) - elif command in ['pairwise_id', 'pi']: + elif command in ["pairwise_id", "pi"]: return self.pairwise_identity(argv) - elif command == 'pis': + elif command == "pis": return self.parsimony_informative_sites(argv) - elif command in ['rel_comp_var', 'relative_composition_variability']: + elif command in ["rel_comp_var", "relative_composition_variability"]: return self.rcv(argv) - elif command == 'rename_fasta': + elif command == "rename_fasta": return self.rename_fasta_entries(argv) - elif command in ['sum_of_pairs_score', 'sops', 'sop']: + elif command in ["sum_of_pairs_score", "sops", "sop"]: return self.sum_of_pairs_score(argv) - elif command == 'vs': + elif command == "vs": return self.variable_sites(argv) # Tree aliases - elif command == 'bss': + elif command == "bss": return self.bipartition_support_stats(argv) - elif command == 'blm': + elif command == "blm": return self.branch_length_multiplier(argv) - elif command in ['collapse', 'cb']: + elif command in ["collapse", "cb"]: return self.collapse_branches(argv) - elif command == 'cover': + elif command == "cover": return self.covarying_evolutionary_rates(argv) - elif command == 'degree_of_violation_of_a_molecular_clock': + elif command == "degree_of_violation_of_a_molecular_clock": return self.dvmc(argv) - elif command == 'evo_rate': + elif command == "evo_rate": return self.evolutionary_rate(argv) - elif command == 'clan_check': + elif command == "clan_check": return self.hidden_paralogy_check(argv) - elif command == 'ibs': + elif command == "ibs": return self.internal_branch_stats(argv) - elif command == 'il': + elif command == "il": return self.internode_labeler(argv) - elif command in ['lca_subtree']: + elif command in ["lca_subtree"]: return self.last_common_ancestor_subtree(argv) - elif command in ['long_branch_score', 'lbs']: + elif command in ["long_branch_score", "lbs"]: return self.lb_score(argv) - elif command == 'is_monophyletic': + elif command == "is_monophyletic": return self.monophyly_check(argv) - elif command == 'nni': + elif command == "nni": return self.nearest_neighbor_interchange(argv) - elif command == 'pd': + elif command == "pd": return self.patristic_distances(argv) - elif command in ['polyt_test', 'ptt', 'polyt']: + elif command in ["polyt_test", "ptt", "polyt"]: return self.polytomy_test(argv) - elif command in ['print', 'pt']: + elif command in ["print", "pt"]: return self.print_tree(argv) - elif command == 'prune': + elif command == "prune": return self.prune_tree(argv) - elif command in ['rename_tree', 'rename_tips']: + elif command in ["rename_tree", "rename_tips"]: return self.rename_tree_tips(argv) - elif command in ['robinson_foulds_distance', 'rf_dist', 'rf']: + elif command in ["robinson_foulds_distance", "rf_dist", "rf"]: return self.rf_distance(argv) - elif command in ['root', 'rt']: + elif command in ["root", "rt"]: return self.root_tree(argv) - elif command in ['spurious_seq', 'ss']: + elif command in ["spurious_seq", "ss"]: return self.spurious_sequence(argv) - elif command == 'tbs': + elif command == "tbs": return self.terminal_branch_stats(argv) - elif command in ['labels', 'tree_labels', 'tl']: + elif command in ["labels", "tree_labels", "tl"]: return self.tip_labels(argv) - elif command in ['t2t_dist', 't2t']: + elif command in ["t2t_dist", "t2t"]: return self.tip_to_tip_distance(argv) - elif command in ['t2t_node_dist', 't2t_nd']: + elif command in ["t2t_node_dist", "t2t_nd"]: return self.tip_to_tip_node_distance(argv) - elif command == 'tree_len': + elif command == "tree_len": return self.total_tree_length(argv) - elif command == 'tness': + elif command == "tness": return self.treeness(argv) # Alignment- and tree-based aliases - elif command == 'sat': + elif command == "sat": return self.saturation(argv) - elif command in ['toverr', 'tor']: + elif command in ["toverr", "tor"]: return self.treeness_over_rcv(argv) # Helper aliases - elif command in ['create_concat', 'cc']: + elif command in ["create_concat", "cc"]: return self.create_concatenation_matrix(argv) - elif command in ['pal2nal', 'p2n']: + elif command in ["pal2nal", "p2n"]: return self.thread_dna(argv) else: print(textwrap.dedent(help_header)) - print("Invalid command option. See help for a complete list of commands and aliases.") + print( + "Invalid command option. See help for a complete list of commands and aliases." + ) sys.exit(1) ## print version def version(self): - print(textwrap.dedent( - f"""\ + print( + textwrap.dedent( + f"""\ {self.help_header} """ - )) + ) + ) ## Alignment functions @staticmethod def alignment_length(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -398,7 +404,8 @@ def alignment_length(argv): @staticmethod def alignment_length_no_gaps(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -442,7 +449,8 @@ def alignment_length_no_gaps(argv): @staticmethod def column_score(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -477,13 +485,14 @@ def column_score(argv): ), ) parser.add_argument("fasta", type=str, help=SUPPRESS) - parser.add_argument("-r","--reference", type=str, help=SUPPRESS) + parser.add_argument("-r", "--reference", type=str, help=SUPPRESS) args = parser.parse_args(argv) ColumnScore(args).run() @staticmethod def faidx(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -516,13 +525,14 @@ def faidx(argv): ), ) parser.add_argument("fasta", type=str, help=SUPPRESS) - parser.add_argument("-e","--entry", type=str, help=SUPPRESS) + parser.add_argument("-e", "--entry", type=str, help=SUPPRESS) args = parser.parse_args(argv) Faidx(args).run() @staticmethod def gc_content(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -561,13 +571,16 @@ def gc_content(argv): ), ) parser.add_argument("fasta", type=str, help=SUPPRESS) - parser.add_argument("-v", "--verbose", action="store_true", required=False, help=SUPPRESS) + parser.add_argument( + "-v", "--verbose", action="store_true", required=False, help=SUPPRESS + ) args = parser.parse_args(argv) GCContent(args).run() @staticmethod def pairwise_identity(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -609,13 +622,16 @@ def pairwise_identity(argv): ), ) parser.add_argument("alignment", type=str, help=SUPPRESS) - parser.add_argument("-v", "--verbose", action="store_true", required=False, help=SUPPRESS) + parser.add_argument( + "-v", "--verbose", action="store_true", required=False, help=SUPPRESS + ) args = parser.parse_args(argv) PairwiseIdentity(args).run() @staticmethod def parsimony_informative_sites(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -661,7 +677,8 @@ def parsimony_informative_sites(argv): @staticmethod def rcv(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -699,7 +716,8 @@ def rcv(argv): @staticmethod def rename_fasta_entries(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -742,14 +760,15 @@ def rename_fasta_entries(argv): ), ) parser.add_argument("fasta", type=str, help=SUPPRESS) - parser.add_argument("-i","--idmap", type=str, help=SUPPRESS) + parser.add_argument("-i", "--idmap", type=str, help=SUPPRESS) parser.add_argument("-o", "--output", type=str, required=False, help=SUPPRESS) args = parser.parse_args(argv) RenameFastaEntries(args).run() @staticmethod def sum_of_pairs_score(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -784,13 +803,14 @@ def sum_of_pairs_score(argv): ), ) parser.add_argument("fasta", type=str, help=SUPPRESS) - parser.add_argument("-r","--reference", type=str, help=SUPPRESS) + parser.add_argument("-r", "--reference", type=str, help=SUPPRESS) args = parser.parse_args(argv) SumOfPairsScore(args).run() @staticmethod def variable_sites(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -832,11 +852,11 @@ def variable_sites(argv): args = parser.parse_args(argv) VariableSites(args).run() - ## Tree functions @staticmethod def bipartition_support_stats(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -871,18 +891,15 @@ def bipartition_support_stats(argv): ) parser.add_argument("tree", type=str, help=SUPPRESS) parser.add_argument( - "-v", - "--verbose", - action="store_true", - required=False, - help=SUPPRESS + "-v", "--verbose", action="store_true", required=False, help=SUPPRESS ) args = parser.parse_args(argv) BipartitionSupportStats(args).run() @staticmethod def branch_length_multiplier(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -921,18 +938,15 @@ def branch_length_multiplier(argv): ), ) parser.add_argument("tree", type=str, help=SUPPRESS) - parser.add_argument( - "-f", "--factor", - type=float, required=True, - help=SUPPRESS - ) + parser.add_argument("-f", "--factor", type=float, required=True, help=SUPPRESS) parser.add_argument("-o", "--output", type=str, required=False, help=SUPPRESS) args = parser.parse_args(argv) BranchLengthMultiplier(args).run() @staticmethod def collapse_branches(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -971,18 +985,15 @@ def collapse_branches(argv): ), ) parser.add_argument("tree", type=str, help=SUPPRESS) - parser.add_argument( - "-s", "--support", - type=float, required=True, - help=SUPPRESS - ) + parser.add_argument("-s", "--support", type=float, required=True, help=SUPPRESS) parser.add_argument("-o", "--output", type=str, required=False, help=SUPPRESS) args = parser.parse_args(argv) CollapseBranches(args).run() @staticmethod def covarying_evolutionary_rates(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -1048,13 +1059,16 @@ def covarying_evolutionary_rates(argv): parser.add_argument( "-r", "--reference", type=str, required=True, help=SUPPRESS, metavar="" ) - parser.add_argument("-v", "--verbose", action="store_true", required=False, help=SUPPRESS) + parser.add_argument( + "-v", "--verbose", action="store_true", required=False, help=SUPPRESS + ) args = parser.parse_args(argv) CovaryingEvolutionaryRates(args).run() @staticmethod def dvmc(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -1106,7 +1120,8 @@ def dvmc(argv): @staticmethod def evolutionary_rate(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -1142,7 +1157,8 @@ def evolutionary_rate(argv): @staticmethod def hidden_paralogy_check(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -1217,7 +1233,8 @@ def hidden_paralogy_check(argv): @staticmethod def internal_branch_stats(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -1251,18 +1268,15 @@ def internal_branch_stats(argv): ) parser.add_argument("tree", type=str, help=SUPPRESS) parser.add_argument( - "-v", - "--verbose", - action="store_true", - required=False, - help=SUPPRESS + "-v", "--verbose", action="store_true", required=False, help=SUPPRESS ) args = parser.parse_args(argv) InternalBranchStats(args).run() @staticmethod def internode_labeler(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -1299,7 +1313,8 @@ def internode_labeler(argv): @staticmethod def last_common_ancestor_subtree(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -1340,7 +1355,8 @@ def last_common_ancestor_subtree(argv): @staticmethod def lb_score(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -1383,13 +1399,16 @@ def lb_score(argv): ), ) parser.add_argument("tree", type=str, help=SUPPRESS) - parser.add_argument("-v", "--verbose", action="store_true", required=False, help=SUPPRESS) + parser.add_argument( + "-v", "--verbose", action="store_true", required=False, help=SUPPRESS + ) args = parser.parse_args(argv) LBScore(args).run() @staticmethod def monophyly_check(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -1442,7 +1461,8 @@ def monophyly_check(argv): @staticmethod def nearest_neighbor_interchange(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -1486,7 +1506,8 @@ def nearest_neighbor_interchange(argv): @staticmethod def patristic_distances(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -1523,13 +1544,16 @@ def patristic_distances(argv): ), ) parser.add_argument("tree", type=str, help=SUPPRESS) - parser.add_argument("-v", "--verbose", action="store_true", required=False, help=SUPPRESS) + parser.add_argument( + "-v", "--verbose", action="store_true", required=False, help=SUPPRESS + ) args = parser.parse_args(argv) - PatristicDistances(args).run() + PatristicDistances(args).run() @staticmethod def polytomy_test(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -1587,11 +1611,12 @@ def polytomy_test(argv): parser.add_argument("-t", "--trees", type=str, help=SUPPRESS) parser.add_argument("-g", "--groups", type=str, help=SUPPRESS) args = parser.parse_args(argv) - PolytomyTest(args).run() + PolytomyTest(args).run() @staticmethod def print_tree(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -1625,13 +1650,16 @@ def print_tree(argv): ), ) parser.add_argument("tree", type=str, help=SUPPRESS) - parser.add_argument("-r", "--remove", action="store_true", required=False, help=SUPPRESS) + parser.add_argument( + "-r", "--remove", action="store_true", required=False, help=SUPPRESS + ) args = parser.parse_args(argv) PrintTree(args).run() @staticmethod def prune_tree(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -1680,18 +1708,15 @@ def prune_tree(argv): parser.add_argument("list_of_taxa", type=str, help=SUPPRESS) parser.add_argument("-o", "--output", type=str, required=False, help=SUPPRESS) parser.add_argument( - "-k", "--keep", - type=str2bool, - nargs='?', - default=False, - help=SUPPRESS + "-k", "--keep", type=str2bool, nargs="?", default=False, help=SUPPRESS ) args = parser.parse_args(argv) PruneTree(args).run() @staticmethod def rename_tree_tips(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -1734,14 +1759,15 @@ def rename_tree_tips(argv): ), ) parser.add_argument("tree", type=str, help=SUPPRESS) - parser.add_argument("-i","--idmap", type=str, help=SUPPRESS) + parser.add_argument("-i", "--idmap", type=str, help=SUPPRESS) parser.add_argument("-o", "--output", type=str, required=False, help=SUPPRESS) args = parser.parse_args(argv) RenameTreeTips(args).run() @staticmethod def rf_distance(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -1794,7 +1820,8 @@ def rf_distance(argv): @staticmethod def root_tree(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -1844,7 +1871,8 @@ def root_tree(argv): @staticmethod def spurious_sequence(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -1894,17 +1922,14 @@ def spurious_sequence(argv): ), ) parser.add_argument("tree", type=str, help=SUPPRESS) - parser.add_argument( - "-f", "--factor", - type=float, required=False, - help=SUPPRESS - ) + parser.add_argument("-f", "--factor", type=float, required=False, help=SUPPRESS) args = parser.parse_args(argv) SpuriousSequence(args).run() @staticmethod def terminal_branch_stats(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -1938,18 +1963,15 @@ def terminal_branch_stats(argv): ) parser.add_argument("tree", type=str, help=SUPPRESS) parser.add_argument( - "-v", - "--verbose", - action="store_true", - required=False, - help=SUPPRESS + "-v", "--verbose", action="store_true", required=False, help=SUPPRESS ) args = parser.parse_args(argv) TerminalBranchStats(args).run() @staticmethod def tip_labels(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -1980,7 +2002,8 @@ def tip_labels(argv): @staticmethod def tip_to_tip_distance(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -2023,7 +2046,8 @@ def tip_to_tip_distance(argv): @staticmethod def tip_to_tip_node_distance(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -2067,7 +2091,8 @@ def tip_to_tip_node_distance(argv): @staticmethod def total_tree_length(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -2098,7 +2123,8 @@ def total_tree_length(argv): @staticmethod def treeness(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -2142,7 +2168,8 @@ def treeness(argv): ## Alignment and tree functions @staticmethod def saturation(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -2187,13 +2214,16 @@ def saturation(argv): parser.add_argument( "-t", "--tree", type=str, required=True, help=SUPPRESS, metavar="" ) - parser.add_argument("-v", "--verbose", action="store_true", required=False, help=SUPPRESS) + parser.add_argument( + "-v", "--verbose", action="store_true", required=False, help=SUPPRESS + ) args = parser.parse_args(argv) Saturation(args).run() @staticmethod def treeness_over_rcv(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -2242,7 +2272,8 @@ def treeness_over_rcv(argv): ### Helper commands @staticmethod def create_concatenation_matrix(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -2283,22 +2314,15 @@ def create_concatenation_matrix(argv): """ ), ) - parser.add_argument( - "-a", "--alignment_list", - type=str, - help=SUPPRESS - ) - parser.add_argument( - "-p", "--prefix", - type=str, - help=SUPPRESS - ) + parser.add_argument("-a", "--alignment_list", type=str, help=SUPPRESS) + parser.add_argument("-p", "--prefix", type=str, help=SUPPRESS) args = parser.parse_args(argv) CreateConcatenationMatrix(args).run() @staticmethod def thread_dna(argv): - parser = ArgumentParser(add_help=True, + parser = ArgumentParser( + add_help=True, usage=SUPPRESS, formatter_class=RawDescriptionHelpFormatter, description=textwrap.dedent( @@ -2345,162 +2369,193 @@ def thread_dna(argv): """ ), ) + parser.add_argument("-p", "--protein", type=str, help=SUPPRESS) + parser.add_argument("-n", "--nucleotide", type=str, help=SUPPRESS) parser.add_argument( - "-p", "--protein", - type=str, - help=SUPPRESS - ) - parser.add_argument( - "-n", "--nucleotide", - type=str, - help=SUPPRESS - ) - parser.add_argument( - "-c", "--clipkit_log_file", + "-c", + "--clipkit_log_file", type=str, required=False, help=SUPPRESS, ) parser.add_argument( - "-s", "--stop", - type=str2bool, - nargs='?', - default=True, - help=SUPPRESS + "-s", "--stop", type=str2bool, nargs="?", default=True, help=SUPPRESS ) args = parser.parse_args(argv) DNAThreader(args).run() + def main(argv=None): Phykit() + # Alignment-based functions def alignment_length(argv=None): Phykit.alignment_length(sys.argv[1:]) + def alignment_length_no_gaps(argv=None): Phykit.alignment_length_no_gaps(sys.argv[1:]) + def column_score(argv=None): Phykit.column_score(sys.argv[1:]) + def faidx(argv=None): Phykit.faidx(sys.argv[1:]) + def gc_content(argv=None): Phykit.gc_content(sys.argv[1:]) + def pairwise_identity(argv=None): Phykit.pairwise_identity(sys.argv[1:]) + def parsimony_informative_sites(argv=None): Phykit.parsimony_informative_sites(sys.argv[1:]) + def rcv(argv=None): Phykit.rcv(sys.argv[1:]) + def rename_fasta_entries(argv=None): Phykit.rename_fasta_entries(sys.argv[1:]) + def sum_of_pairs_score(argv=None): Phykit.sum_of_pairs_score(sys.argv[1:]) + def variable_sites(argv=None): Phykit.variable_sites(sys.argv[1:]) + # Tree-based functions def bipartition_support_stats(argv=None): Phykit.bipartition_support_stats(sys.argv[1:]) + def branch_length_multiplier(argv=None): Phykit.branch_length_multiplier(sys.argv[1:]) + def collapse_branches(argv=None): Phykit.collapse_branches(sys.argv[1:]) + def covarying_evolutionary_rates(argv=None): Phykit.covarying_evolutionary_rates(sys.argv[1:]) + def dvmc(argv=None): Phykit.dvmc(sys.argv[1:]) + def evolutionary_rate(argv=None): Phykit.evolutionary_rate(sys.argv[1:]) + def hidden_paralogy_check(argv=None): Phykit.hidden_paralogy_check(sys.argv[1:]) + def internal_branch_stats(argv=None): Phykit.internal_branch_stats(sys.argv[1:]) + def internode_labeler(argv=None): Phykit.internode_labeler(sys.argv[1:]) + def last_common_ancestor_subtree(argv=None): Phykit.last_common_ancestor_subtree(sys.argv[1:]) + def lb_score(argv=None): Phykit.lb_score(sys.argv[1:]) + def monophyly_check(argv=None): Phykit.monophyly_check(sys.argv[1:]) + def nearest_neighbor_interchange(argv=None): Phykit.nearest_neighbor_interchange(sys.argv[1:]) + def patristic_distances(argv=None): Phykit.patristic_distances(sys.argv[1:]) + def polytomy_test(argv=None): Phykit.polytomy_test(sys.argv[1:]) + def print_tree(argv=None): Phykit.print_tree(sys.argv[1:]) + def prune_tree(argv=None): Phykit.prune_tree(sys.argv[1:]) + def rename_tree_tips(argv=None): Phykit.rename_tree_tips(sys.argv[1:]) + def rf_distance(argv=None): Phykit.rf_distance(sys.argv[1:]) + def root_tree(argv=None): Phykit.root_tree(sys.argv[1:]) + def spurious_sequence(argv=None): Phykit.spurious_sequence(sys.argv[1:]) + def terminal_branch_stats(argv=None): Phykit.terminal_branch_stats(sys.argv[1:]) + def tip_labels(argv=None): Phykit.tip_labels(sys.argv[1:]) + def tip_to_tip_distance(argv=None): Phykit.tip_to_tip_distance(sys.argv[1:]) + def tip_to_tip_node_distance(argv=None): Phykit.tip_to_tip_node_distance(sys.argv[1:]) + def total_tree_length(argv=None): Phykit.total_tree_length(sys.argv[1:]) + def treeness(argv=None): Phykit.treeness(sys.argv[1:]) + # Alignment- and tree-based functions def saturation(argv=None): Phykit.saturation(sys.argv[1:]) + def treeness_over_rcv(argv=None): Phykit.treeness_over_rcv(sys.argv[1:]) + # Helper functions def create_concatenation_matrix(argv=None): Phykit.create_concatenation_matrix(sys.argv[1:]) + def thread_dna(argv=None): Phykit.thread_dna(sys.argv[1:]) - diff --git a/phykit/services/alignment/alignment_length.py b/phykit/services/alignment/alignment_length.py index 7dd8c18..b30225d 100644 --- a/phykit/services/alignment/alignment_length.py +++ b/phykit/services/alignment/alignment_length.py @@ -1,7 +1,6 @@ -from Bio.Align import MultipleSeqAlignment - from .base import Alignment + class AlignmentLength(Alignment): def __init__(self, args) -> None: super().__init__(**self.process_args(args)) diff --git a/phykit/services/alignment/alignment_length_no_gaps.py b/phykit/services/alignment/alignment_length_no_gaps.py index 3a169d2..eb49451 100644 --- a/phykit/services/alignment/alignment_length_no_gaps.py +++ b/phykit/services/alignment/alignment_length_no_gaps.py @@ -1,14 +1,17 @@ -from Bio.Align import MultipleSeqAlignment - from .base import Alignment + class AlignmentLengthNoGaps(Alignment): def __init__(self, args) -> None: super().__init__(**self.process_args(args)) def run(self): alignment, alignment_format = self.get_alignment_and_format() - aln_len_no_gaps, aln_len, aln_len_no_gaps_per = self.calculate_alignment_length_no_gaps(alignment) + ( + aln_len_no_gaps, + aln_len, + aln_len_no_gaps_per, + ) = self.calculate_alignment_length_no_gaps(alignment) print(f"{aln_len_no_gaps}\t{aln_len}\t{round(aln_len_no_gaps_per, 4)}") def process_args(self, args): @@ -20,7 +23,7 @@ def calculate_alignment_length_no_gaps(self, alignment): # calculate percent of variable sites aln_len_no_gaps_per = (aln_len_no_gaps / aln_len) * 100 - + return aln_len_no_gaps, aln_len, aln_len_no_gaps_per def get_sites_no_gaps_count(self, alignment, aln_len): @@ -34,4 +37,4 @@ def get_sites_no_gaps_count(self, alignment, aln_len): if "-" not in seq_at_position: aln_len_no_gaps += 1 - return aln_len_no_gaps \ No newline at end of file + return aln_len_no_gaps diff --git a/phykit/services/alignment/base.py b/phykit/services/alignment/base.py index ba358ec..88f87df 100644 --- a/phykit/services/alignment/base.py +++ b/phykit/services/alignment/base.py @@ -1,7 +1,10 @@ import sys from ..base import BaseService -from ...helpers.files import get_alignment_and_format as get_alignment_and_format_helper +from ...helpers.files import ( + get_alignment_and_format as get_alignment_and_format_helper +) + class Alignment(BaseService): def __init__( @@ -22,8 +25,8 @@ def __init__( ): self.alignment_file_path = alignment_file_path self.output_file_path = output_file_path - self.protein_file_path = protein_file_path, - self.nucleotide_file_path = nucleotide_file_path + self.protein_file_path = (protein_file_path,) + self.nucleotide_file_path = nucleotide_file_path self.alignment_list_path = alignment_list_path self.prefix = prefix self.fasta = fasta @@ -43,20 +46,20 @@ def get_alignment_and_format(self): print("Input corresponds to no such file or directory.") print("Please double check pathing and filenames") sys.exit() - + def calculate_rcv(self): alignment, _ = self.get_alignment_and_format() aln_len = alignment.get_alignment_length() # string to hold all sequences - concat_seq = '' + concat_seq = "" # initialize a counter for the number of sequences in the input fasta file num_records = 0 - # for each record join concatSeq string and sequence as well as keeping track + # for each record join concatSeq string and sequence as well as keeping track # of the number of records for record in alignment: - concat_seq += record.seq + concat_seq += record.seq num_records += 1 # dictionary to hold the average occurence of each sequence letter @@ -64,13 +67,13 @@ def calculate_rcv(self): # loop through the different sequences that appear in the fasta file # population dictionary with how many times that sequence appears for seq in set(concat_seq): - average_d[seq] = (concat_seq.count(seq)/num_records) + average_d[seq] = concat_seq.count(seq) / num_records - # intiailize list to hold the RCV values per ith taxa + # intiailize list to hold the RCV values per ith taxa # that will later be summed indiv_rcv_values = [] - # loop through records again and calculate RCV for + # loop through records again and calculate RCV for # each taxa and append to indivRCVvalues for record in alignment: # temp holds a temporary value of the numerator before appending @@ -79,11 +82,10 @@ def calculate_rcv(self): temp = 0 # calculates the absolute value of the ith sequence letter minus the average for seq_letter in set(concat_seq): - temp += abs(record.seq.count(seq_letter)-average_d[seq_letter]) - indiv_rcv_values.append(temp/(num_records*aln_len)) + temp += abs(record.seq.count(seq_letter) - average_d[seq_letter]) + indiv_rcv_values.append(temp / (num_records * aln_len)) relative_composition_variability = sum(indiv_rcv_values) # print the sum of all RCV values return relative_composition_variability - diff --git a/phykit/services/alignment/column_score.py b/phykit/services/alignment/column_score.py index cede8bd..736ddbe 100644 --- a/phykit/services/alignment/column_score.py +++ b/phykit/services/alignment/column_score.py @@ -1,70 +1,61 @@ from collections import Counter -import itertools -import sys -from Bio import SeqIO, AlignIO +from Bio import AlignIO from .base import Alignment + class ColumnScore(Alignment): def __init__(self, args) -> None: super().__init__(**self.process_args(args)) def run(self): - # create biopython object of sequences for query and + # create biopython object of sequences for query and # reference alignments query_records = AlignIO.read(self.fasta, "fasta") reference_records = AlignIO.read(self.reference, "fasta") # create lists with strings of every columns - ref_columns, query_columns = self.get_columns_from_alignments(reference_records, query_records) - + ref_columns, query_columns = self.get_columns_from_alignments( + reference_records, query_records + ) + # count the number of matches and total pairs - number_of_matches, number_of_total_columns = self.calculate_matches_between_ref_and_query_columns(ref_columns, query_columns) + ( + number_of_matches, + number_of_total_columns, + ) = self.calculate_matches_between_ref_and_query_columns( + ref_columns, query_columns + ) # print res - print(round(number_of_matches/number_of_total_columns, 4)) + print(round(number_of_matches / number_of_total_columns, 4)) def process_args(self, args) -> dict: - return dict( - fasta=args.fasta, - reference=args.reference - ) + return dict(fasta=args.fasta, reference=args.reference) - def get_columns_from_alignments( - self, - reference_records, - query_records - ): + def get_columns_from_alignments(self, reference_records, query_records): # loop through reference alignment and get each column ref_columns = [] for i in range(reference_records.get_alignment_length()): - ref_seq_at_position = '' - ref_seq_at_position += reference_records[:,i] + ref_seq_at_position = "" + ref_seq_at_position += reference_records[:, i] ref_seq_at_position = ref_seq_at_position.upper() ref_columns.append(ref_seq_at_position) # loop through query alignment and get each column query_columns = [] for i in range(query_records.get_alignment_length()): - query_seq_at_position = '' - query_seq_at_position += query_records[:,i] + query_seq_at_position = "" + query_seq_at_position += query_records[:, i] query_seq_at_position = query_seq_at_position.upper() query_columns.append(query_seq_at_position) return ref_columns, query_columns def calculate_matches_between_ref_and_query_columns( - self, - ref_columns: list, - query_columns :list + self, ref_columns: list, query_columns: list ): # determine the number of matches matches = list((Counter(ref_columns) & Counter(query_columns)).elements()) return len(matches), len(query_columns) - - - - - - diff --git a/phykit/services/alignment/create_concatenation_matrix.py b/phykit/services/alignment/create_concatenation_matrix.py index 7f0a104..8b0729f 100644 --- a/phykit/services/alignment/create_concatenation_matrix.py +++ b/phykit/services/alignment/create_concatenation_matrix.py @@ -1,15 +1,14 @@ -import statistics as stat import sys from textwrap import dedent from typing import Tuple -from Bio import SeqIO, SeqRecord -from Bio.Seq import Seq +from Bio import SeqIO from .base import Alignment from ...helpers.files import read_single_column_file_to_list + class CreateConcatenationMatrix(Alignment): def __init__(self, args) -> None: super().__init__(**self.process_args(args)) @@ -18,17 +17,11 @@ def run(self): self.create_concatenation_matrix(self.alignment_list_path, self.prefix) def process_args(self, args) -> dict: - return dict( - alignment_list_path=args.alignment_list, - prefix=args.prefix - ) + return dict(alignment_list_path=args.alignment_list, prefix=args.prefix) - def read_alignment_paths( - self, - alignment_list_path: str - ) -> list: + def read_alignment_paths(self, alignment_list_path: str) -> list: """ - Read alignment paths into a list + Read alignment paths into a list """ try: return read_single_column_file_to_list(alignment_list_path) @@ -36,10 +29,7 @@ def read_alignment_paths( print("Alignment list file (-a) is not found. Please check pathing.") sys.exit() - def get_taxa_names( - self, - alignment_paths: list - ) -> list: + def get_taxa_names(self, alignment_paths: list) -> list: # get taxa names taxa = set() for alignment_path in alignment_paths: @@ -55,14 +45,15 @@ def print_start_message( alignment_paths: list, file_partition: str, fasta_output: str, - file_occupancy: str + file_occupancy: str, ) -> None: """ print a log message to the user about general features of input and output file names """ - start_message = dedent(f""" + start_message = dedent( + f""" -------------------- | General features | -------------------- @@ -76,16 +67,12 @@ def print_start_message( Partition file output: {file_partition} Concatenated fasta output: {fasta_output} Occupancy report: {file_occupancy} - """) + """ + ) print(start_message) - - - def get_list_of_taxa_and_records( - self, - alignment_path: str - ) -> Tuple[list, list]: + def get_list_of_taxa_and_records(self, alignment_path: str) -> Tuple[list, list]: """ get list of a taxa and their records """ @@ -101,54 +88,41 @@ def get_list_of_taxa_and_records( return og_taxa, records - def determine_missing_sequences( - self, - taxa: list, - og_taxa: list - ) -> list: + def determine_missing_sequences(self, taxa: list, og_taxa: list) -> list: """ determine taxa not present in an alignment file """ - missing_taxa = list(set(taxa)-set(og_taxa)) + missing_taxa = list(set(taxa) - set(og_taxa)) return missing_taxa def create_missing_seq_str( - self, - records: list, - alignment_path: str + self, records: list, alignment_path: str ) -> Tuple[str, int]: """ create a placeholder string for sequences with missing taxa """ - og_len = '' - missing_seq = '' + og_len = "" + missing_seq = "" try: og_len = len(records[0].seq) except IndexError: print(f"There are no sequence records in {alignment_path}.\nExiting now...") sys.exit() - missing_seq = og_len * '?' - + missing_seq = og_len * "?" + return missing_seq, og_len def create_missing_taxa_sequence( - self, - missing_taxa: list, - missing_seq: str, - concat: dict + self, missing_taxa: list, missing_seq: str, concat: dict ) -> dict: - for indiv in missing_taxa: concat[indiv].append(missing_seq) - - return concat + + return concat def create_record_for_present_taxa( - self, - records: list, - og_taxa: list, - concat: dict + self, records: list, og_taxa: list, concat: dict ) -> dict: """ create a record for taxa that are present in the concatenation dict @@ -166,7 +140,7 @@ def add_to_partition_info( field_one: str, fasta: str, first_len: int, - second_len: int + second_len: int, ) -> Tuple[list, int, int]: """ append to partition file @@ -182,11 +156,7 @@ def add_to_partition_info( return partition_info, first_len, second_len def add_to_occupancy_info( - self, - occupancy_info: list, - og_taxa: list, - missing_taxa: list, - fasta: str + self, occupancy_info: list, og_taxa: list, missing_taxa: list, fasta: str ) -> list: """ write partition file to detail boundaries in the concat alignment @@ -197,7 +167,7 @@ def add_to_occupancy_info( # determine number of missing taxa num_missing = len(missing_taxa) # determine percent occupied - percent_occupancy = num_present/(num_present+num_missing) + percent_occupancy = num_present / (num_present + num_missing) # handle missing taxa if num_missing == 0: missing_taxa = ["None"] @@ -206,18 +176,14 @@ def add_to_occupancy_info( og_taxa.sort() entry = f"{str(fasta)}\t{str(num_present)}\t{str(num_missing)}\t{str(percent_occupancy)}\t{';'.join(missing_taxa)}\t{';'.join(og_taxa)}\n" occupancy_info.append(entry) - + return occupancy_info - def fasta_file_write( - self, - fasta_output: str, - concat: dict - ) -> None: + def fasta_file_write(self, fasta_output: str, concat: dict) -> None: """ - join seqs of genes in value (list) and write to concat_fa + join seqs of genes in value (list) and write to concat_fa """ - with open(fasta_output, "w") as final_fasta_file: + with open(fasta_output, "w") as final_fasta_file: for x in concat: concatenated = [] for s in concat[x]: @@ -227,14 +193,12 @@ def fasta_file_write( except AttributeError: # if a string concatenated.append(str(s)) - concat[x] = ''.join(concatenated) + concat[x] = "".join(concatenated) entry = f">{x}\n{''.join(str(concat[x]))}\n" - final_fasta_file.write(str(entry)) + final_fasta_file.write(str(entry)) def write_occupancy_or_partition_file( - self, - list_of_lists: list, - output_file_name: str + self, list_of_lists: list, output_file_name: str ) -> None: """ loops through occupancy or partition file information and writes @@ -246,9 +210,7 @@ def write_occupancy_or_partition_file( f.write(entry) def create_concatenation_matrix( - self, - alignment_list_path: str, - prefix: str + self, alignment_list_path: str, prefix: str ) -> None: alignment_paths = self.read_alignment_paths(alignment_list_path) @@ -261,85 +223,57 @@ def create_concatenation_matrix( # print log message self.print_start_message( - taxa, - alignment_paths, - file_partition, - fasta_output, - file_occupancy + taxa, alignment_paths, file_partition, fasta_output, file_occupancy ) # assign placeholders for lengths in the partition file - first_len = 1 - second_len = 0 + first_len = 1 + second_len = 0 # create a dictionary with keys as taxa and empty list values concatenated_seqs = {key: [] for key in taxa} # string for first field of partition file - field_one = 'AUTO' + field_one = "AUTO" - # create empty lists to contain information for the + # create empty lists to contain information for the # partition and occupancy files partition_info = [] occupancy_info = [] # loop through alignment files for alignment_path in alignment_paths: - # create a list of taxa and records - og_taxa, records = self.get_list_of_taxa_and_records( - alignment_path - ) - + og_taxa, records = self.get_list_of_taxa_and_records(alignment_path) + # initialize list and determine missing taxa - missing_taxa = self.determine_missing_sequences( - taxa, - og_taxa - ) + missing_taxa = self.determine_missing_sequences(taxa, og_taxa) # create string for missing seq - missing_seq, og_len = self.create_missing_seq_str( - records, - alignment_path - ) + missing_seq, og_len = self.create_missing_seq_str(records, alignment_path) # create missing taxa sequence concatenated_seqs = self.create_missing_taxa_sequence( - missing_taxa, - missing_seq, - concatenated_seqs - ) + missing_taxa, missing_seq, concatenated_seqs + ) # create record for present data concatenated_seqs = self.create_record_for_present_taxa( - records, - og_taxa, - concatenated_seqs + records, og_taxa, concatenated_seqs ) # append partition to to partition info partition_info, first_len, second_len = self.add_to_partition_info( - partition_info, - og_len, - field_one, - alignment_path, - first_len, - second_len + partition_info, og_len, field_one, alignment_path, first_len, second_len ) # append occupancy data to occupancy info occupancy_info = self.add_to_occupancy_info( - occupancy_info, - og_taxa, - missing_taxa, - alignment_path + occupancy_info, og_taxa, missing_taxa, alignment_path ) # write output fasta file - self.fasta_file_write( - fasta_output, - concatenated_seqs - ) + self.fasta_file_write(fasta_output, concatenated_seqs) # write out occupancy and partition file self.write_occupancy_or_partition_file(occupancy_info, file_occupancy) diff --git a/phykit/services/alignment/dna_threader.py b/phykit/services/alignment/dna_threader.py index fe92110..85d73a6 100644 --- a/phykit/services/alignment/dna_threader.py +++ b/phykit/services/alignment/dna_threader.py @@ -14,7 +14,6 @@ def __init__(self, args) -> None: self.process_args(args) def process_args(self, args): - self.include_stop_codon = args.stop self.protein_file_path = args.protein self.nucleotide_file_path = args.nucleotide @@ -25,39 +24,36 @@ def run(self): nucl = self.read_file(self.nucleotide_file_path) if self.clipkit_log_file is not None: - clipkit_log = [line.split(' ')[1] for line in open(self.clipkit_log_file).readlines()] + clipkit_log = [ + line.split(" ")[1] for line in open(self.clipkit_log_file).readlines() + ] else: clipkit_log = self.clipkit_log_file pal2nal = self.thread(prot, nucl, clipkit_log) for record in pal2nal: - sequence = ''.join(pal2nal[record]) + sequence = "".join(pal2nal[record]) print(f">{record}") print(f"{sequence}") - def read_file( - self, - file_path: str, - file_format: str = "fasta" - ) -> SeqRecord: + def read_file(self, file_path: str, file_format: str = "fasta") -> SeqRecord: return SeqIO.parse(file_path, file_format) - def thread( - self, - protein: SeqRecord, - nucleotide: SeqRecord, - clipkit_log - ) -> dict: + def thread(self, protein: SeqRecord, nucleotide: SeqRecord, clipkit_log) -> dict: # protein alignment to nucleotide alignment pal2nal = {} - + try: # when ClipKIT log file is provided if clipkit_log: - for protein_seq_record, nucleotide_seq_record in zip(protein, nucleotide): - # get gene id and sequences - gene_id, p_seq, n_seq = self.get_id_and_seqs(protein_seq_record, nucleotide_seq_record) + for protein_seq_record, nucleotide_seq_record in zip( + protein, nucleotide + ): + # get gene id and sequences + gene_id, p_seq, n_seq = self.get_id_and_seqs( + protein_seq_record, nucleotide_seq_record + ) # initialize gap counter and gene in pal2nal dict gap_count = 0 @@ -77,10 +73,7 @@ def thread( # if AA is not a gap, insert the corresponding codon if p_seq[aa_idx] != "-": pal2nal = self.add_codon_when_log_file_is_used( - nucl_idx, - n_seq, - gene_id, - pal2nal + nucl_idx, n_seq, gene_id, pal2nal ) else: # if AA is a stop or ambiguous insert a codon of gaps @@ -88,18 +81,19 @@ def thread( pal2nal = self.add_gap(pal2nal, gene_id) else: pal2nal = self.add_codon_when_log_file_is_used( - nucl_idx, - n_seq, - gene_id, - pal2nal + nucl_idx, n_seq, gene_id, pal2nal ) aa_idx += 1 nucl_idx += 3 else: - for protein_seq_record, nucleotide_seq_record in zip(protein, nucleotide): + for protein_seq_record, nucleotide_seq_record in zip( + protein, nucleotide + ): # get gene id and sequences - gene_id, p_seq, n_seq = self.get_id_and_seqs(protein_seq_record, nucleotide_seq_record) + gene_id, p_seq, n_seq = self.get_id_and_seqs( + protein_seq_record, nucleotide_seq_record + ) # initialize gap counter and gene in pal2nal dict gap_count = 0 @@ -116,11 +110,7 @@ def thread( # if AA is not a gap, insert the corresponding codon if p_seq[aa_idx] != "-": pal2nal = self.add_codon( - aa_idx, - gap_count, - n_seq, - gene_id, - pal2nal + aa_idx, gap_count, n_seq, gene_id, pal2nal ) else: # if AA is a stop or ambiguous insert a codon of gaps @@ -128,11 +118,7 @@ def thread( pal2nal = self.add_gap(pal2nal, gene_id) else: pal2nal = self.add_codon( - aa_idx, - gap_count, - n_seq, - gene_id, - pal2nal + aa_idx, gap_count, n_seq, gene_id, pal2nal ) return pal2nal except FileNotFoundError: @@ -144,9 +130,7 @@ def thread( pass def get_id_and_seqs( - self, - protein_seq_record: SeqRecord, - nucleotide_seq_record: SeqRecord + self, protein_seq_record: SeqRecord, nucleotide_seq_record: SeqRecord ): """ get gene id, protein sequence, and nucleotide sequence @@ -163,11 +147,7 @@ def get_id_and_seqs( return gene_id, p_seq, n_seq - def add_gap( - self, - pal2nal: dict, - gene_id: str - ) -> dict: + def add_gap(self, pal2nal: dict, gene_id: str) -> dict: """ add a gap to the growing sequence """ @@ -177,19 +157,14 @@ def add_gap( return pal2nal def add_codon( - self, - aa_idx: int, - gap_count: int, - n_seq: SeqRecord, - gene_id: str, - pal2nal: dict + self, aa_idx: int, gap_count: int, n_seq: SeqRecord, gene_id: str, pal2nal: dict ) -> dict: """ add a gap to the growing sequence """ nt_window = (aa_idx - gap_count) * 3 - seq = n_seq[nt_window : nt_window + 3]._data + seq = n_seq[nt_window:nt_window + 3]._data seq = seq.decode("utf-8") if not len(seq): seq = "---" @@ -198,25 +173,16 @@ def add_codon( return pal2nal def add_codon_when_log_file_is_used( - self, - nucl_idx: int, - n_seq: SeqRecord, - gene_id: str, - pal2nal: dict + self, nucl_idx: int, n_seq: SeqRecord, gene_id: str, pal2nal: dict ) -> dict: """ add a gap to the growing sequence """ - seq = n_seq[nucl_idx : nucl_idx + 3]._data + seq = n_seq[nucl_idx:nucl_idx + 3]._data seq = seq.decode("utf-8") if not len(seq): seq = "---" pal2nal[gene_id].append(seq) return pal2nal - - - - - diff --git a/phykit/services/alignment/faidx.py b/phykit/services/alignment/faidx.py index 98e23b9..0acc51a 100644 --- a/phykit/services/alignment/faidx.py +++ b/phykit/services/alignment/faidx.py @@ -2,6 +2,7 @@ from .base import Alignment + class Faidx(Alignment): def __init__(self, args) -> None: super().__init__(**self.process_args(args)) diff --git a/phykit/services/alignment/gc_content.py b/phykit/services/alignment/gc_content.py index 915d3d7..fc9dc43 100644 --- a/phykit/services/alignment/gc_content.py +++ b/phykit/services/alignment/gc_content.py @@ -2,12 +2,11 @@ import re import sys -from Bio import SeqIO - from .base import Alignment from ...helpers.files import get_alignment_and_format + class FileFormat(Enum): fasta = "fasta" clustal = "clustal" @@ -17,6 +16,7 @@ class FileFormat(Enum): phylip_seq = "phylip-sequential" stockholm = "stockholm" + class GCContent(Alignment): def __init__(self, args) -> None: super().__init__(**self.process_args(args)) @@ -24,7 +24,7 @@ def __init__(self, args) -> None: def run(self): # create biopython object of sequences records, _ = get_alignment_and_format(self.fasta) - + # initialize and populate dict for # holding the entry sequences entry_and_seq = {} @@ -42,13 +42,15 @@ def run(self): all_seqs = [] for entry, seq in entry_and_seq.items(): all_seqs.append(seq) - all_seqs = ''.join(all_seqs) + all_seqs = "".join(all_seqs) all_seqs, matches = self.find_matches_and_remove_gaps(all_seqs) try: - gc_content = round(len(matches)/len(all_seqs), 4) + gc_content = round(len(matches) / len(all_seqs), 4) except ZeroDivisionError: try: - print("Input file has an unacceptable format. Please check input file argument.") + print( + "Input file has an unacceptable format. Please check input file argument." + ) sys.exit() except BrokenPipeError: pass @@ -56,17 +58,13 @@ def run(self): print(gc_content) except BrokenPipeError: pass - def process_args(self, args): - return dict( - fasta=args.fasta, - verbose=args.verbose - ) + return dict(fasta=args.fasta, verbose=args.verbose) def find_matches_and_remove_gaps(self, seq: str): - regex_pattern = re.compile('[GgCc]') - seq = seq.replace('-', '') - seq = seq.replace('?', '') + regex_pattern = re.compile("[GgCc]") + seq = seq.replace("-", "") + seq = seq.replace("?", "") matches = regex_pattern.findall(seq) return seq, matches diff --git a/phykit/services/alignment/pairwise_identity.py b/phykit/services/alignment/pairwise_identity.py index dd07b93..82d5a74 100644 --- a/phykit/services/alignment/pairwise_identity.py +++ b/phykit/services/alignment/pairwise_identity.py @@ -1,11 +1,11 @@ import itertools -import statistics as stat - -from Bio.Align import MultipleSeqAlignment -import numpy as np from .base import Alignment -from ...helpers.stats_summary import calculate_summary_statistics_from_dict, print_summary_statistics +from ...helpers.stats_summary import ( + calculate_summary_statistics_from_dict, + print_summary_statistics, +) + class PairwiseIdentity(Alignment): def __init__(self, args) -> None: @@ -14,15 +14,17 @@ def __init__(self, args) -> None: def run(self): # get aln alignment, alignment_format = self.get_alignment_and_format() - + # get entry indices entries = self.get_entry_indices(alignment) # determine pairwise combinations of entry indices combos = list(itertools.combinations(entries, 2)) - pairwise_identities, stats = self.calculate_pairwise_identities(alignment, combos) - + pairwise_identities, stats = self.calculate_pairwise_identities( + alignment, combos + ) + if self.verbose: try: for pair, identity in pairwise_identities.items(): @@ -40,7 +42,7 @@ def get_entry_indices(self, alignment) -> list: entries_count = 0 for record in alignment: entries.append(entries_count) - entries_count+=1 + entries_count += 1 return entries @@ -57,9 +59,9 @@ def calculate_pairwise_identities(self, alignment, combos): for idx in range(0, aln_len): if seq_one[idx] == seq_two[idx]: identities += 1 - ids = alignment[combo[0]].id + '-' + alignment[combo[1]].id + ids = alignment[combo[0]].id + "-" + alignment[combo[1]].id pairwise_identities[ids] = identities / aln_len stats = calculate_summary_statistics_from_dict(pairwise_identities) - return pairwise_identities, stats \ No newline at end of file + return pairwise_identities, stats diff --git a/phykit/services/alignment/parsimony_informative_sites.py b/phykit/services/alignment/parsimony_informative_sites.py index 3051210..007e683 100644 --- a/phykit/services/alignment/parsimony_informative_sites.py +++ b/phykit/services/alignment/parsimony_informative_sites.py @@ -1,29 +1,22 @@ -import sys - -from Bio.Align import MultipleSeqAlignment - from .base import Alignment + class ParsimonyInformative(Alignment): def __init__(self, args) -> None: super().__init__(**self.process_args(args)) def run(self): alignment, _ = self.get_alignment_and_format() - pi_sites, aln_len, pi_sites_per = self.calculate_parsimony_informative_sites(alignment) + pi_sites, aln_len, pi_sites_per = self.calculate_parsimony_informative_sites( + alignment + ) if (pi_sites, aln_len, pi_sites_per): print(f"{pi_sites}\t{aln_len}\t{round(pi_sites_per, 4)}") - def process_args(self, args): return dict(alignment_file_path=args.alignment) - def get_number_of_occurences_per_character( - self, - alignment, - idx: int - ) -> dict: - + def get_number_of_occurences_per_character(self, alignment, idx: int) -> dict: # obtain sequence at position, remove gaps, and make # all characters uppercase seq_at_position = "" @@ -35,38 +28,31 @@ def get_number_of_occurences_per_character( return num_occurences - def count_if_parsimony_informative( - self, - num_occurences: dict, - pi_sites: int - ): + def count_if_parsimony_informative(self, num_occurences: dict, pi_sites: int): # create a dictionary of characters that occur at least twice d = dict((k, v) for k, v in num_occurences.items() if v >= 2) # determine number of characters that occur at least twice if len(d) >= 2: - pi_sites += 1 - + pi_sites += 1 + return pi_sites - def calculate_parsimony_informative_sites( - self, - alignment - ): + def calculate_parsimony_informative_sites(self, alignment): # get aln length aln_len = alignment.get_alignment_length() - + pi_sites = 0 # count number of parsimony informative sites for idx in range(0, aln_len, int(1)): # count occurneces of each character at site idx - num_occurences = self.get_number_of_occurences_per_character(alignment, idx) + num_occurences = self.get_number_of_occurences_per_character(alignment, idx) # add one to pi_sites if site idx is parsimony informative pi_sites = self.count_if_parsimony_informative(num_occurences, pi_sites) # calculate percent of variable sites - pi_sites_per = (pi_sites / aln_len)*100 - - return pi_sites, aln_len, pi_sites_per \ No newline at end of file + pi_sites_per = (pi_sites / aln_len) * 100 + + return pi_sites, aln_len, pi_sites_per diff --git a/phykit/services/alignment/rcv.py b/phykit/services/alignment/rcv.py index d063b2f..4968bcd 100644 --- a/phykit/services/alignment/rcv.py +++ b/phykit/services/alignment/rcv.py @@ -1,8 +1,6 @@ -from Bio import SeqIO -from Bio.Seq import Seq - from .base import Alignment + class RelativeCompositionVariability(Alignment): def __init__(self, args) -> None: super().__init__(**self.process_args(args)) @@ -12,6 +10,5 @@ def run(self): relative_composition_variability = self.calculate_rcv() print(round(relative_composition_variability, 4)) - def process_args(self, args): - return dict(alignment_file_path=args.alignment) \ No newline at end of file + return dict(alignment_file_path=args.alignment) diff --git a/phykit/services/alignment/rename_fasta_entries.py b/phykit/services/alignment/rename_fasta_entries.py index d404b78..cb27a80 100644 --- a/phykit/services/alignment/rename_fasta_entries.py +++ b/phykit/services/alignment/rename_fasta_entries.py @@ -4,6 +4,7 @@ from .base import Alignment + class RenameFastaEntries(Alignment): def __init__(self, args) -> None: super().__init__(**self.process_args(args)) @@ -12,7 +13,7 @@ def run(self): # create biopython object of sequences try: records = SeqIO.parse(self.fasta, "fasta") - except FileNotFoundError: + except FileNotFoundError: print(f"{self.fasta} corresponds to no such file or directory.") print("Please double check pathing and filenames") sys.exit() @@ -23,7 +24,6 @@ def run(self): # replace and write out self.replace_ids_and_write(self.output_file_path, records, idmap) - def process_args(self, args): if args.output is None: output_file_path = f"{args.fasta}.renamed.fa" @@ -31,9 +31,7 @@ def process_args(self, args): output_file_path = f"{args.output}" return dict( - fasta=args.fasta, - idmap=args.idmap, - output_file_path=output_file_path + fasta=args.fasta, idmap=args.idmap, output_file_path=output_file_path ) def replace_ids_and_write(self, output_file_path, records, idmap): @@ -42,22 +40,20 @@ def replace_ids_and_write(self, output_file_path, records, idmap): replace that tip name with the value in the idmap and write to output file """ - with open(output_file_path, 'w') as output_file_path: + with open(output_file_path, "w") as output_file_path: for record in records: if record.id in idmap: # replace ID record.id = idmap[record.id] # remove description - record.description = '' + record.description = "" SeqIO.write(record, output_file_path, "fasta") - - - def idmap_to_dictionary(self, idmap:str) -> dict: + def idmap_to_dictionary(self, idmap: str) -> dict: """ read idmap into a dictionary """ - idmap={} + idmap = {} try: with open(self.idmap) as identifiers: for line in identifiers: @@ -68,4 +64,3 @@ def idmap_to_dictionary(self, idmap:str) -> dict: print(f"{self.idmap} corresponds to no such file or directory.") print("Please double check pathing and filenames") sys.exit() - diff --git a/phykit/services/alignment/sum_of_pairs_score.py b/phykit/services/alignment/sum_of_pairs_score.py index 6a09335..5a3f3dc 100644 --- a/phykit/services/alignment/sum_of_pairs_score.py +++ b/phykit/services/alignment/sum_of_pairs_score.py @@ -1,17 +1,17 @@ from collections import Counter import itertools -import sys from Bio import SeqIO from .base import Alignment + class SumOfPairsScore(Alignment): def __init__(self, args) -> None: super().__init__(**self.process_args(args)) def run(self): - # create biopython object of sequences for query and + # create biopython object of sequences for query and # reference alignments query_records = SeqIO.to_dict(SeqIO.parse(self.fasta, "fasta")) reference_records = SeqIO.to_dict(SeqIO.parse(self.reference, "fasta")) @@ -20,26 +20,21 @@ def run(self): record_id_pairs = self.get_record_ids(reference_records) # calculate how many matches there are and how many total pairs there are - number_of_matches, number_of_total_pairs = self.determine_number_of_matches_and_total_pairs( - record_id_pairs, - reference_records, - query_records + ( + number_of_matches, + number_of_total_pairs, + ) = self.determine_number_of_matches_and_total_pairs( + record_id_pairs, reference_records, query_records ) - + # print res - print(round(number_of_matches/number_of_total_pairs, 4)) + print(round(number_of_matches / number_of_total_pairs, 4)) def process_args(self, args) -> dict: - return dict( - fasta=args.fasta, - reference=args.reference - ) + return dict(fasta=args.fasta, reference=args.reference) - def get_record_ids( - self, - reference_records: dict - ) -> list: - # loop through record names and save each to + def get_record_ids(self, reference_records: dict) -> list: + # loop through record names and save each to record_ids = [] for entry_name in reference_records: record_ids.append(entry_name) @@ -48,10 +43,7 @@ def get_record_ids( return record_id_pairs def determine_number_of_matches_and_total_pairs( - self, - record_id_pairs: list, - reference_records: dict, - query_records: dict + self, record_id_pairs: list, reference_records: dict, query_records: dict ): number_of_matches = 0 number_of_total_pairs = 0 @@ -59,23 +51,26 @@ def determine_number_of_matches_and_total_pairs( for record_pair in record_id_pairs: first_in_pair = record_pair[0] second_in_pair = record_pair[1] - + pairs_in_reference = [] pairs_in_query = [] # for each pair, loop through the length of the alignment and get sequence at each site for i in range(0, len(reference_records[first_in_pair].seq)): - pairs_in_reference.append(reference_records[first_in_pair].seq[i] + reference_records[second_in_pair].seq[i]) + pairs_in_reference.append( + reference_records[first_in_pair].seq[i] + + reference_records[second_in_pair].seq[i] + ) for i in range(0, len(query_records[first_in_pair].seq)): - pairs_in_query.append(query_records[first_in_pair].seq[i] + query_records[second_in_pair].seq[i]) + pairs_in_query.append( + query_records[first_in_pair].seq[i] + + query_records[second_in_pair].seq[i] + ) # count the number of matches and total pairs - matches = list((Counter(pairs_in_reference) & Counter(pairs_in_query)).elements()) - number_of_matches+=len(matches) - number_of_total_pairs+=len(pairs_in_reference) + matches = list( + (Counter(pairs_in_reference) & Counter(pairs_in_query)).elements() + ) + number_of_matches += len(matches) + number_of_total_pairs += len(pairs_in_reference) return number_of_matches, number_of_total_pairs - - - - - diff --git a/phykit/services/alignment/variable_sites.py b/phykit/services/alignment/variable_sites.py index c18b64a..4591948 100644 --- a/phykit/services/alignment/variable_sites.py +++ b/phykit/services/alignment/variable_sites.py @@ -1,7 +1,6 @@ -from Bio.Align import MultipleSeqAlignment - from .base import Alignment + class VariableSites(Alignment): def __init__(self, args) -> None: super().__init__(**self.process_args(args)) @@ -12,13 +11,12 @@ def run(self): if (var_sites, aln_len, var_sites_per): print(f"{var_sites}\t{aln_len}\t{round(var_sites_per, 4)}") - def process_args(self, args): return dict(alignment_file_path=args.alignment) def calculate_variable_sites(self, alignment): aln_len = alignment.get_alignment_length() - + var_sites = 0 # count number of variable sites for i in range(0, aln_len, int(1)): @@ -30,6 +28,6 @@ def calculate_variable_sites(self, alignment): if len(set(seq_at_position)) > 1: var_sites += 1 # calculate percent of variable sites - var_sites_per = (var_sites / aln_len)*100 - - return var_sites, aln_len, var_sites_per \ No newline at end of file + var_sites_per = (var_sites / aln_len) * 100 + + return var_sites, aln_len, var_sites_per diff --git a/phykit/services/tree/__init__.py b/phykit/services/tree/__init__.py index b378735..d2d406c 100644 --- a/phykit/services/tree/__init__.py +++ b/phykit/services/tree/__init__.py @@ -26,4 +26,4 @@ from .tip_to_tip_distance import TipToTipDistance from .tip_to_tip_node_distance import TipToTipNodeDistance from .total_tree_length import TotalTreeLength -from .collapse_branches import CollapseBranches \ No newline at end of file +from .collapse_branches import CollapseBranches diff --git a/phykit/services/tree/base.py b/phykit/services/tree/base.py index ac723ff..91e55e5 100644 --- a/phykit/services/tree/base.py +++ b/phykit/services/tree/base.py @@ -1,10 +1,10 @@ import sys -from typing import Tuple from Bio import Phylo from ..base import BaseService + class Tree(BaseService): def __init__( self, @@ -56,7 +56,6 @@ def read_tree_file(self): print("Please check filename and pathing") sys.exit() - def read_tree1_file(self): try: return Phylo.read(self.tree1_file_path, self.tree_format) @@ -65,7 +64,6 @@ def read_tree1_file(self): print("Please check filename and pathing") sys.exit() - def read_reference_tree_file(self): try: return Phylo.read(self.reference, self.tree_format) @@ -74,27 +72,20 @@ def read_reference_tree_file(self): print("Please check filename and pathing") sys.exit() - def write_tree_file(self, tree, output_file_path): return Phylo.write(tree, output_file_path, self.tree_format) - def get_tip_names_from_tree( - self, tree - ) -> list: + def get_tip_names_from_tree(self, tree) -> list: """ get tip names from a tree """ tips = [] for tip in tree.get_terminals(): tips.append(tip.name) - + return tips - def shared_tips( - self, - a, - b - ): + def shared_tips(self, a, b): """ Determines what tips are shared between two trees ------------------------------------------------- @@ -102,29 +93,25 @@ def shared_tips( list of tips from one tree argv: b list of tips from a second tree - """ - - a_set = set(a) - b_set = set(b) - - # check length - if len(a_set.intersection(b_set)) > 0: - return(list(a_set.intersection(b_set))) - else: - print("no common tips") + """ + + a_set = set(a) + b_set = set(b) + + # check length + if len(a_set.intersection(b_set)) > 0: + return list(a_set.intersection(b_set)) + else: + print("no common tips") sys.exit() - def prune_tree_using_taxa_list( - self, - tree, - taxa_to_prune: list - ): + def prune_tree_using_taxa_list(self, tree, taxa_to_prune: list): """ prune taxa from tree """ for taxon in taxa_to_prune: tree.prune(taxon) - + return tree def calculate_treeness(self, tree=None, print_value=False): diff --git a/phykit/services/tree/bipartition_support_stats.py b/phykit/services/tree/bipartition_support_stats.py index c9befa4..9f8bc80 100644 --- a/phykit/services/tree/bipartition_support_stats.py +++ b/phykit/services/tree/bipartition_support_stats.py @@ -1,15 +1,9 @@ -import getopt -import logging -import math -import os.path -import statistics as stat -import sys - -from Bio import Phylo -import numpy as np - from .base import Tree -from ...helpers.stats_summary import calculate_summary_statistics_from_arr, print_summary_statistics +from ...helpers.stats_summary import ( + calculate_summary_statistics_from_arr, + print_summary_statistics, +) + class BipartitionSupportStats(Tree): def __init__(self, args) -> None: @@ -17,7 +11,7 @@ def __init__(self, args) -> None: def run(self): tree = self.read_tree_file() - bs_vals = self.get_bipartition_support_vals(tree) + bs_vals = self.get_bipartition_support_vals(tree) if self.verbose: try: @@ -26,7 +20,7 @@ def run(self): except BrokenPipeError: pass else: - stats = calculate_summary_statistics_from_arr(bs_vals) + stats = calculate_summary_statistics_from_arr(bs_vals) print_summary_statistics(stats) def process_args(self, args): @@ -41,4 +35,4 @@ def get_bipartition_support_vals(self, tree): # only include if a bootstrap value is present if terminal.confidence != None: bs_vals.append(terminal.confidence) - return bs_vals \ No newline at end of file + return bs_vals diff --git a/phykit/services/tree/branch_length_multiplier.py b/phykit/services/tree/branch_length_multiplier.py index 1bd4408..1904927 100644 --- a/phykit/services/tree/branch_length_multiplier.py +++ b/phykit/services/tree/branch_length_multiplier.py @@ -1,12 +1,6 @@ -import getopt -import logging -import os.path -import sys - -from Bio import Phylo - from .base import Tree + class BranchLengthMultiplier(Tree): def __init__(self, args) -> None: super().__init__(**self.process_args(args)) @@ -25,14 +19,14 @@ def process_args(self, args): return dict( tree_file_path=args.tree, factor=args.factor, - output_file_path=output_file_path + output_file_path=output_file_path, ) def multiply_branch_lengths_by_factor(self, tree, factor): for internode in tree.get_nonterminals(): if internode.branch_length != None: - internode.branch_length = internode.branch_length*factor + internode.branch_length = internode.branch_length * factor for term in tree.get_terminals(): if term.branch_length != None: - term.branch_length = term.branch_length*factor + term.branch_length = term.branch_length * factor return tree diff --git a/phykit/services/tree/collapse_branches.py b/phykit/services/tree/collapse_branches.py index af8170c..3babca5 100644 --- a/phykit/services/tree/collapse_branches.py +++ b/phykit/services/tree/collapse_branches.py @@ -1,19 +1,15 @@ -import getopt -import logging -import os.path -import sys - -from Bio import Phylo - from .base import Tree + class CollapseBranches(Tree): def __init__(self, args) -> None: super().__init__(**self.process_args(args)) def run(self): tree = self.read_tree_file() - tree.collapse_all(lambda c: c.confidence is not None and c.confidence < self.support) + tree.collapse_all( + lambda c: c.confidence is not None and c.confidence < self.support + ) self.write_tree_file(tree, self.output_file_path) def process_args(self, args): @@ -25,4 +21,4 @@ def process_args(self, args): tree_file_path=args.tree, support=args.support, output_file_path=output_file_path, - ) \ No newline at end of file + ) diff --git a/phykit/services/tree/covarying_evolutionary_rates.py b/phykit/services/tree/covarying_evolutionary_rates.py index 32173f9..91685d9 100644 --- a/phykit/services/tree/covarying_evolutionary_rates.py +++ b/phykit/services/tree/covarying_evolutionary_rates.py @@ -1,7 +1,6 @@ import copy -import sys -from scipy.stats import (pearsonr, zscore) +from scipy.stats import pearsonr, zscore from .base import Tree @@ -15,9 +14,9 @@ def run(self): tree_one = self.read_tree1_file() tree_ref = self.read_reference_tree_file() - ## - Calculate correlation between two gene trees - ## and save results to an array, corrArr. - ## - Branch lengths will also be part of output + # - Calculate correlation between two gene trees + # and save results to an array, corrArr. + # - Branch lengths will also be part of output # get tree tip names tree_zero_tips = self.get_tip_names_from_tree(tree_zero) @@ -40,15 +39,27 @@ def run(self): # obtain corrected branch lengths where branch lengths # are corrected by the species tree branch length - tree_zero_corr_branch_lengths, tree_one_corr_branch_lengths, tip_names = self.correct_branch_lengths(tree_zero, tree_one, tree_ref) + ( + tree_zero_corr_branch_lengths, + tree_one_corr_branch_lengths, + tip_names, + ) = self.correct_branch_lengths(tree_zero, tree_one, tree_ref) # remove corrected BLs greater than 5 outlier_indices = [] - outlier_indices = self.get_indices_of_outlier_branch_lengths(tree_zero_corr_branch_lengths, outlier_indices) - outlier_indices = self.get_indices_of_outlier_branch_lengths(tree_one_corr_branch_lengths, outlier_indices) + outlier_indices = self.get_indices_of_outlier_branch_lengths( + tree_zero_corr_branch_lengths, outlier_indices + ) + outlier_indices = self.get_indices_of_outlier_branch_lengths( + tree_one_corr_branch_lengths, outlier_indices + ) - tree_zero_corr_branch_lengths = self.remove_outliers_based_on_indices(tree_zero_corr_branch_lengths, outlier_indices) - tree_one_corr_branch_lengths = self.remove_outliers_based_on_indices(tree_one_corr_branch_lengths, outlier_indices) + tree_zero_corr_branch_lengths = self.remove_outliers_based_on_indices( + tree_zero_corr_branch_lengths, outlier_indices + ) + tree_one_corr_branch_lengths = self.remove_outliers_based_on_indices( + tree_one_corr_branch_lengths, outlier_indices + ) tip_names = self.remove_outliers_based_on_indices(tip_names, outlier_indices) # standardize values for final correction @@ -57,12 +68,20 @@ def run(self): # Calculate correlation and append to results array # also keep a list of p values - corr = (list(pearsonr(tree_zero_corr_branch_lengths, tree_one_corr_branch_lengths))) + corr = list( + pearsonr(tree_zero_corr_branch_lengths, tree_one_corr_branch_lengths) + ) try: if self.verbose: - for val_zero, val_one, tip_name in zip(tree_zero_corr_branch_lengths, tree_one_corr_branch_lengths, tip_names): - print(f"{round(val_zero, 4)}\t{round(val_one, 4)}\t{';'.join(tip_name)}") + for val_zero, val_one, tip_name in zip( + tree_zero_corr_branch_lengths, + tree_one_corr_branch_lengths, + tip_names, + ): + print( + f"{round(val_zero, 4)}\t{round(val_one, 4)}\t{';'.join(tip_name)}" + ) else: print(f"{round(corr[0], 4)}\t{round(corr[1], 6)}") except BrokenPipeError: @@ -73,20 +92,20 @@ def process_args(self, args): tree_file_path=args.tree_zero, tree1_file_path=args.tree_one, reference=args.reference, - verbose=args.verbose + verbose=args.verbose, ) def get_indices_of_outlier_branch_lengths( self, corr_branch_lengths, outlier_indices ): """ - create index for branch lengths that + create index for branch lengths that have an absolute value greater than 5 """ - for idx in range(0, len(corr_branch_lengths)): + for idx in range(0, len(corr_branch_lengths)): try: if corr_branch_lengths[idx] > 5 or corr_branch_lengths[idx] < -5: - if idx not in outlier_indices: + if idx not in outlier_indices: outlier_indices.append(idx) except TypeError: outlier_indices.append(idx) @@ -98,12 +117,12 @@ def remove_outliers_based_on_indices(self, corr_branch_lengths, outlier_indices) remove value if the value is an outlier according to the outlier indices list """ - corr_branch_lengths = [i for j, i in enumerate(corr_branch_lengths) if j not in outlier_indices] + corr_branch_lengths = [ + i for j, i in enumerate(corr_branch_lengths) if j not in outlier_indices + ] return corr_branch_lengths - def prune_tips( - self, tree, tips - ): + def prune_tips(self, tree, tips): """ prune tips from trees """ @@ -113,12 +132,7 @@ def prune_tips( return tree - def correct_branch_lengths( - self, - t0, - t1, - sp - ): + def correct_branch_lengths(self, t0, t1, sp): """ obtain a list of corrected branch lengths """ @@ -141,7 +155,7 @@ def correct_branch_lengths( for i in sp.get_nonterminals(): newtree = copy.deepcopy(t0) newtree1 = copy.deepcopy(t1) - sp_tips = self.get_tip_names_from_tree(i) + sp_tips = self.get_tip_names_from_tree(i) newtree = newtree.common_ancestor(sp_tips) newtree1 = newtree1.common_ancestor(sp_tips) try: diff --git a/phykit/services/tree/dvmc.py b/phykit/services/tree/dvmc.py index 4b543f0..84b8de6 100644 --- a/phykit/services/tree/dvmc.py +++ b/phykit/services/tree/dvmc.py @@ -1,20 +1,11 @@ -import getopt -import logging import math -import os.path -import statistics as stat -import sys from typing import Tuple -from Bio import Phylo -from Bio.Phylo.BaseTree import TreeMixin -import itertools -import numpy as np - from .base import Tree from ...helpers.files import read_single_column_file_to_list + class DVMC(Tree): def __init__(self, args) -> None: super().__init__(**self.process_args(args)) @@ -23,17 +14,14 @@ def run(self): tree = self.read_tree_file() outgroup = read_single_column_file_to_list(self.outgroup_taxa_file_path) dvmc = self.determine_dvmc(tree, outgroup) - - print(round(dvmc, 4)) + print(round(dvmc, 4)) def process_args(self, args): return dict(tree_file_path=args.tree, outgroup_taxa_file_path=args.root) def get_names_of_outgroup_taxa_that_are_present( - self, - outgroup: list, - tree: Tree + self, outgroup: list, tree: Tree ) -> list: # initialize list for outgroup taxa that are present in tree out_pres = [] @@ -45,14 +33,11 @@ def get_names_of_outgroup_taxa_that_are_present( return out_pres - def get_term_to_root_dist_and_sum_of_distances( - self, - tree - ) -> Tuple[float, float]: + def get_term_to_root_dist_and_sum_of_distances(self, tree) -> Tuple[float, float]: """ calculate root to tip distances and the sum of root to tip distances - """ + """ dist = [] sum_dist = 0 for term in tree.get_terminals(): @@ -63,28 +48,23 @@ def get_term_to_root_dist_and_sum_of_distances( return dist, sum_dist - def calculate_dvmc( - self, - dist: float, - sum_dist: float, - num_spp: int - ) -> float: + def calculate_dvmc(self, dist: float, sum_dist: float, num_spp: int) -> float: """ calculate dvmc from tip to root distances """ # determine dvmc # calculate average tree distance - avg_dist = sum_dist/num_spp + avg_dist = sum_dist / num_spp # determine the sum of i=1 to N for (x_i-x_bar)^2 sumi2N = float(0.0) for x_i in dist: - sumi2N += ((x_i-avg_dist)**2) + sumi2N += (x_i - avg_dist) ** 2 # multiply sumi2N by 1/(N-1) where N is the number of spp # and then take the square root dvmc = float(0.0) - dvmc = math.sqrt((1/(num_spp-1))*sumi2N) + dvmc = math.sqrt((1 / (num_spp - 1)) * sumi2N) return dvmc @@ -98,7 +78,7 @@ def determine_dvmc(self, tree: Tree, outgroup: list): # prune outgroup taxa from tree tree = self.prune_tree_using_taxa_list(tree, out_pres) - # loop through terminal branches and store + # loop through terminal branches and store # distances from the root to the tip in a list. # Also, calc the sum of all tip to root distances dist, sum_dist = self.get_term_to_root_dist_and_sum_of_distances(tree) @@ -108,4 +88,3 @@ def determine_dvmc(self, tree: Tree, outgroup: list): # calculate and return dvmc return self.calculate_dvmc(dist, sum_dist, num_spp) - \ No newline at end of file diff --git a/phykit/services/tree/evolutionary_rate.py b/phykit/services/tree/evolutionary_rate.py index 66f2702..715baaa 100644 --- a/phykit/services/tree/evolutionary_rate.py +++ b/phykit/services/tree/evolutionary_rate.py @@ -9,8 +9,7 @@ def run(self): tree = self.read_tree_file() total_tree_length = self.calculate_total_tree_length(tree) num_terminals = tree.count_terminals() - print(round(total_tree_length/num_terminals, 4)) - + print(round(total_tree_length / num_terminals, 4)) def process_args(self, args): return dict(tree_file_path=args.tree) diff --git a/phykit/services/tree/hidden_paralogy_check.py b/phykit/services/tree/hidden_paralogy_check.py index 535a8ed..f3c9f57 100644 --- a/phykit/services/tree/hidden_paralogy_check.py +++ b/phykit/services/tree/hidden_paralogy_check.py @@ -26,7 +26,7 @@ def run(self): clade_of_interest = list(set(clade).intersection(tree_tips)) # check for sufficient representation - if len(clade_of_interest)<=1: + if len(clade_of_interest) <= 1: res_arr.append(["insufficient_taxon_representation"]) continue @@ -39,16 +39,15 @@ def run(self): # get common ancestor of clan of interest tree = tree.common_ancestor(shared_tree_tips) # get common ancestor tree tips - common_ancestor_tips = self.get_tip_names_from_tree(tree) - diff_tips_between_clade_and_curr_tree = list(set(clade_of_interest) ^ set(common_ancestor_tips)) + common_ancestor_tips = self.get_tip_names_from_tree(tree) + diff_tips_between_clade_and_curr_tree = list( + set(clade_of_interest) ^ set(common_ancestor_tips) + ) stats = self.get_bootstrap_statistics(tree) res_arr = self.populate_res_arr( - shared_tree_tips, - diff_tips_between_clade_and_curr_tree, - stats, - res_arr + shared_tree_tips, diff_tips_between_clade_and_curr_tree, stats, res_arr ) self.print_results(res_arr) @@ -60,22 +59,22 @@ def process_args(self, args): tree_file_path=tree_file_path, clade=args.clade, ) - + def read_clades_file(self, clades): try: clades = [[s for s in l.split()] for l in open(self.clade).readlines()] except FileNotFoundError: print("Clade file is not found. Please check pathing.") sys.exit() - + return clades - + def get_bootstrap_statistics(self, tree): # get bootstrap support values bs_vals = [] # populate bs_vals with bootstrap values for terminal in tree.get_nonterminals(): - # only include if a bootstrap value is present + # only include if a bootstrap value is present if terminal.confidence != None: bs_vals.append(terminal.confidence) stats = calculate_summary_statistics_from_arr(bs_vals) @@ -83,13 +82,8 @@ def get_bootstrap_statistics(self, tree): return stats def populate_res_arr( - self, - shared_tree_tips, - diff_tips_between_clade_and_curr_tree, - stats, - res_arr + self, shared_tree_tips, diff_tips_between_clade_and_curr_tree, stats, res_arr ): - temp = [] if len(diff_tips_between_clade_and_curr_tree) == 0: @@ -110,8 +104,12 @@ def print_results(self, res_arr): try: if len(res[5]) != 0: res[5].sort() - print(f"{res[0]}\t{round(res[1], 4)}\t{round(res[2], 4)}\t{round(res[3], 4)}\t{round(res[4], 4)}\t{';'.join(res[5])}") + print( + f"{res[0]}\t{round(res[1], 4)}\t{round(res[2], 4)}\t{round(res[3], 4)}\t{round(res[4], 4)}\t{';'.join(res[5])}" + ) else: - print(f"{res[0]}\t{round(res[1], 4)}\t{round(res[2], 4)}\t{round(res[3], 4)}\t{round(res[4], 4)}") + print( + f"{res[0]}\t{round(res[1], 4)}\t{round(res[2], 4)}\t{round(res[3], 4)}\t{round(res[4], 4)}" + ) except IndexError: - print(f"{res[0]}") \ No newline at end of file + print(f"{res[0]}") diff --git a/phykit/services/tree/internal_branch_stats.py b/phykit/services/tree/internal_branch_stats.py index 9025027..9c9ff38 100644 --- a/phykit/services/tree/internal_branch_stats.py +++ b/phykit/services/tree/internal_branch_stats.py @@ -2,7 +2,11 @@ from .base import Tree -from ...helpers.stats_summary import calculate_summary_statistics_from_arr, print_summary_statistics +from ...helpers.stats_summary import ( + calculate_summary_statistics_from_arr, + print_summary_statistics, +) + class InternalBranchStats(Tree): def __init__(self, args) -> None: @@ -15,7 +19,7 @@ def run(self): if self.verbose: try: for len_and_name in lengths_and_names: - print(round(len_and_name[0], 4), ';'.join(len_and_name[1])) + print(round(len_and_name[0], 4), ";".join(len_and_name[1])) except BrokenPipeError: pass else: @@ -41,25 +45,28 @@ def get_internal_branch_lengths(self, tree) -> list: temp.append(temp_name) lengths_and_names.append(temp) - return internal_branch_lengths, lengths_and_names + return internal_branch_lengths, lengths_and_names - def check_tree_has_branch_lengths(self, internal_branch_lengths:list) -> None: + def check_tree_has_branch_lengths(self, internal_branch_lengths: list) -> None: """ if tree has no branch lengths, exit """ if len(internal_branch_lengths) == 0: - print("Calculating internal branch statistics requires a phylogeny with branch lengths.") + print( + "Calculating internal branch statistics requires a phylogeny with branch lengths." + ) sys.exit() - def calculate_internal_branch_stats(self, tree): # save internal branch lengths to internal_branch_lengths - internal_branch_lengths, lengths_and_names = self.get_internal_branch_lengths(tree) - + internal_branch_lengths, lengths_and_names = self.get_internal_branch_lengths( + tree + ) + # If the phylogeny had no branch lengths, inform user and quit self.check_tree_has_branch_lengths(internal_branch_lengths) - + # calculate summary stats stats = calculate_summary_statistics_from_arr(internal_branch_lengths) - return internal_branch_lengths, stats, lengths_and_names \ No newline at end of file + return internal_branch_lengths, stats, lengths_and_names diff --git a/phykit/services/tree/last_common_ancestor_subtree.py b/phykit/services/tree/last_common_ancestor_subtree.py index c0c6483..3c0426d 100644 --- a/phykit/services/tree/last_common_ancestor_subtree.py +++ b/phykit/services/tree/last_common_ancestor_subtree.py @@ -1,7 +1,5 @@ import sys -from Bio import Phylo - from .base import Tree from ...helpers.files import read_single_column_file_to_list diff --git a/phykit/services/tree/lb_score.py b/phykit/services/tree/lb_score.py index 78d7afe..86602bb 100644 --- a/phykit/services/tree/lb_score.py +++ b/phykit/services/tree/lb_score.py @@ -1,16 +1,12 @@ import sys -import getopt -import os.path -import statistics as stat - -from Bio import Phylo -from Bio.Phylo.BaseTree import TreeMixin import itertools -import numpy as np from .base import Tree -from ...helpers.stats_summary import calculate_summary_statistics_from_arr, print_summary_statistics +from ...helpers.stats_summary import ( + calculate_summary_statistics_from_arr, + print_summary_statistics, +) class LBScore(Tree): @@ -33,11 +29,7 @@ def run(self): def process_args(self, args): return dict(tree_file_path=args.tree, verbose=args.verbose) - def calculate_average_distance_between_tips( - self, - tips: list, - tree - ) -> float: + def calculate_average_distance_between_tips(self, tips: list, tree) -> float: # determine pairwise combinations of tips combos = list(itertools.combinations(tips, 2)) @@ -45,14 +37,12 @@ def calculate_average_distance_between_tips( # avg_dist is PDa total_dist = float() for combo in combos: - total_dist+=tree.distance(combo[0], combo[1]) - - return total_dist/len(combos) + total_dist += tree.distance(combo[0], combo[1]) + + return total_dist / len(combos) def calculate_average_distance_of_taxon_to_other_taxa( - self, - tips: list, - tree + self, tips: list, tree ) -> list: """ calculate average distance of taxon to all other taxon or average PDi. @@ -62,38 +52,34 @@ def calculate_average_distance_of_taxon_to_other_taxa( for tip in tips: tips_minus_i = list(set(tips) - set(tip)) PDi = [] - for tip_minus in tips_minus_i: + for tip_minus in tips_minus_i: PDi.append(tree.distance(tip, tip_minus)) PDi = sum(PDi) / len(PDi) avg_PDis.append(PDi) return avg_PDis - def calculate_lb_score_per_taxa( - self, - avg_PDis:list, - avg_dist:float - ) -> list: + def calculate_lb_score_per_taxa(self, avg_PDis: list, avg_dist: float) -> list: """ create a list with the lb scores for each taxon """ LBis = [] for PDi in avg_PDis: try: - LBis.append((((PDi/avg_dist)-1)*100)) + LBis.append((((PDi / avg_dist) - 1) * 100)) except ZeroDivisionError: try: print("Invalid tree. Tree should contain branch lengths") sys.exit() except BrokenPipeError: pass - + return LBis def calculate_lb_score(self, tree): # get tree tips tips = self.get_tip_names_from_tree(tree) - + # get average distance between tips avg_dist = self.calculate_average_distance_between_tips(tips, tree) @@ -103,5 +89,5 @@ def calculate_lb_score(self, tree): # use PDis and avgDist to calculate LB values for each taxon LBis = self.calculate_lb_score_per_taxa(avg_PDis, avg_dist) - + return tips, LBis diff --git a/phykit/services/tree/monophyly_check.py b/phykit/services/tree/monophyly_check.py index b05c9dc..d54e217 100644 --- a/phykit/services/tree/monophyly_check.py +++ b/phykit/services/tree/monophyly_check.py @@ -24,7 +24,7 @@ def run(self): taxa_of_interest = list(set(taxa).intersection(tree_tips)) # check for sufficient representation - if len(taxa_of_interest)<=1: + if len(taxa_of_interest) <= 1: res_arr.append(["insufficient_taxon_representation"]) sys.exit() @@ -37,15 +37,15 @@ def run(self): # get common ancestor of clan of interest tree = tree.common_ancestor(shared_tree_tips) # get common ancestor tree tips - common_ancestor_tips = self.get_tip_names_from_tree(tree) - diff_tips_between_clade_and_curr_tree = list(set(taxa_of_interest) ^ set(common_ancestor_tips)) + common_ancestor_tips = self.get_tip_names_from_tree(tree) + diff_tips_between_clade_and_curr_tree = list( + set(taxa_of_interest) ^ set(common_ancestor_tips) + ) stats = self.get_bootstrap_statistics(tree) res_arr = self.populate_res_arr( - diff_tips_between_clade_and_curr_tree, - stats, - res_arr + diff_tips_between_clade_and_curr_tree, stats, res_arr ) self.print_results(res_arr) @@ -55,26 +55,20 @@ def process_args(self, args): tree_file_path=args.tree, list_of_taxa=args.list_of_taxa, ) - + def get_bootstrap_statistics(self, tree): # get bootstrap support values bs_vals = [] # populate bs_vals with bootstrap values for terminal in tree.get_nonterminals(): - # only include if a bootstrap value is present + # only include if a bootstrap value is present if terminal.confidence != None: bs_vals.append(terminal.confidence) stats = calculate_summary_statistics_from_arr(bs_vals) return stats - def populate_res_arr( - self, - diff_tips_between_clade_and_curr_tree, - stats, - res_arr - ): - + def populate_res_arr(self, diff_tips_between_clade_and_curr_tree, stats, res_arr): temp = [] if len(diff_tips_between_clade_and_curr_tree) == 0: @@ -95,8 +89,12 @@ def print_results(self, res_arr): try: if len(res[5]) != 0: res[5].sort() - print(f"{res[0]}\t{round(res[1], 4)}\t{round(res[2], 4)}\t{round(res[3], 4)}\t{round(res[4], 4)}\t{';'.join(res[5])}") + print( + f"{res[0]}\t{round(res[1], 4)}\t{round(res[2], 4)}\t{round(res[3], 4)}\t{round(res[4], 4)}\t{';'.join(res[5])}" + ) else: - print(f"{res[0]}\t{round(res[1], 4)}\t{round(res[2], 4)}\t{round(res[3], 4)}\t{round(res[4], 4)}") + print( + f"{res[0]}\t{round(res[1], 4)}\t{round(res[2], 4)}\t{round(res[3], 4)}\t{round(res[4], 4)}" + ) except IndexError: - print(f"{res[0]}") \ No newline at end of file + print(f"{res[0]}") diff --git a/phykit/services/tree/nearest_neighbor_interchange.py b/phykit/services/tree/nearest_neighbor_interchange.py index 827b45d..2a8f3fd 100644 --- a/phykit/services/tree/nearest_neighbor_interchange.py +++ b/phykit/services/tree/nearest_neighbor_interchange.py @@ -4,37 +4,35 @@ from .base import Tree + class NearestNeighborInterchange(Tree): def __init__(self, args) -> None: super().__init__(**self.process_args(args)) def run(self): tree = self.read_tree_file() - + # remove branch lengths if specified by user # otherwise, print ascii tree all_nnis = [tree] for t in self.get_neighbors(tree): all_nnis.append(t) Phylo.write(all_nnis, self.output_file_path, "newick") - + def process_args(self, args): - tree_file_path=args.tree + tree_file_path = args.tree if args.output is None: output_file_path = f"{tree_file_path}.nnis" else: output_file_path = f"{args.output}" - return dict( - tree_file_path=tree_file_path, - output_file_path=output_file_path - ) + return dict(tree_file_path=tree_file_path, output_file_path=output_file_path) def get_neighbors(self, tree): - ### This code is from BioPython (so is this comment) - # Get all neighbor trees of the given tree (PRIVATE). - # Currently only for binary rooted trees. - ### + ### This code is from BioPython (so is this comment) + # Get all neighbor trees of the given tree (PRIVATE). + # Currently only for binary rooted trees. + ### # make child to parent dict parents = {} for clade in tree.find_clades(): diff --git a/phykit/services/tree/patristic_distances.py b/phykit/services/tree/patristic_distances.py index 8154ed3..a1edb94 100644 --- a/phykit/services/tree/patristic_distances.py +++ b/phykit/services/tree/patristic_distances.py @@ -1,17 +1,13 @@ -import sys -import getopt -import os.path -import statistics as stat from typing import Tuple - -from Bio import Phylo -from Bio.Phylo.BaseTree import TreeMixin import itertools -import numpy as np from .base import Tree -from ...helpers.stats_summary import calculate_summary_statistics_from_arr, print_summary_statistics +from ...helpers.stats_summary import ( + calculate_summary_statistics_from_arr, + print_summary_statistics, +) + class PatristicDistances(Tree): def __init__(self, args) -> None: @@ -46,11 +42,11 @@ def calculate_distance_between_pairs(self, tips: list, tree) -> Tuple[list, list def calculate_patristic_distances(self, tree): # get tree tips tips = self.get_tip_names_from_tree(tree) - + # get distances between pairs of taxa combos, patristic_distances = self.calculate_distance_between_pairs(tips, tree) - + # calculate summary stats stats = calculate_summary_statistics_from_arr(patristic_distances) - - return patristic_distances, combos, stats \ No newline at end of file + + return patristic_distances, combos, stats diff --git a/phykit/services/tree/polytomy_test.py b/phykit/services/tree/polytomy_test.py index 7fc453b..f814cd5 100644 --- a/phykit/services/tree/polytomy_test.py +++ b/phykit/services/tree/polytomy_test.py @@ -1,17 +1,14 @@ import sys -import getopt -import os.path import itertools from scipy.stats import chisquare from typing import Tuple from Bio import Phylo -from Bio.Phylo.BaseTree import TreeMixin from .base import Tree - from ...helpers.files import read_single_column_file_to_list + class PolytomyTest(Tree): def __init__(self, args) -> None: super().__init__(**self.process_args(args)) @@ -26,24 +23,29 @@ def run(self): # read trees into list trees_file_path = read_single_column_file_to_list(self.trees) - # go through all triplets of all trees and + # go through all triplets of all trees and # examine sister relationships among all triplets summary = self.loop_through_trees_and_examine_sister_support_among_triplets( - trees_file_path, - groups_of_groups, - outgroup_taxa + trees_file_path, groups_of_groups, outgroup_taxa ) # count triplet and gene support frequencies for different sister relationships - triplet_group_counts, gene_support_freq = self.get_triplet_and_gene_support_freq_counts(summary) + ( + triplet_group_counts, + gene_support_freq, + ) = self.get_triplet_and_gene_support_freq_counts(summary) # conduct chisquare tests - triplet_res, gene_support_freq_res = self.chisquare_tests(triplet_group_counts, gene_support_freq) + triplet_res, gene_support_freq_res = self.chisquare_tests( + triplet_group_counts, gene_support_freq + ) # print results - self.print_gene_support_freq_res(gene_support_freq_res, gene_support_freq, trees_file_path) + self.print_gene_support_freq_res( + gene_support_freq_res, gene_support_freq, trees_file_path + ) # self.print_triplet_based_res(triplet_res, triplet_group_counts) - + def process_args(self, args): return dict(trees=args.trees, groups=args.groups) @@ -65,15 +67,19 @@ def read_in_groups(self, groups) -> list: except IndexError: try: print(f"{self.groups} contains an indexing error.") - print("Please format the groups file (-g) as a four column tab-delimited file with column 1 being the name of the test") + print( + "Please format the groups file (-g) as a four column tab-delimited file with column 1 being the name of the test" + ) print("col2: the tip names of one group (; separated)") print("col3: the tip names of a second group (; separated)") print("col4: the tip names of a third group (; separated)") - print("col5: the tip names of the outgroup taxa (; separated)") + print( + "col5: the tip names of the outgroup taxa (; separated)" + ) sys.exit() except BrokenPipeError: pass - + except FileNotFoundError: try: print(f"{self.groups} corresponds to no such file.") @@ -84,10 +90,7 @@ def read_in_groups(self, groups) -> list: return groups_arr def loop_through_trees_and_examine_sister_support_among_triplets( - self, - trees_file_path: str, - groups_of_groups: dict, - outgroup_taxa: list + self, trees_file_path: str, groups_of_groups: dict, outgroup_taxa: list ) -> dict: """ go through all trees and all triplets of all trees. For each triplet, @@ -96,22 +99,18 @@ def loop_through_trees_and_examine_sister_support_among_triplets( summary = {} # loop through trees try: - #cnt = 0 + # cnt = 0 for tree_file in trees_file_path: - #cnt+=1 - #print(f"processing tree {cnt} of {len(trees_file_path)}") - tree = Phylo.read(tree_file, 'newick') + # cnt+=1 + # print(f"processing tree {cnt} of {len(trees_file_path)}") + tree = Phylo.read(tree_file, "newick") # get tip names tips = self.get_tip_names_from_tree(tree) - # examine all triplets and their support for + # examine all triplets and their support for # any sister pairing summary = self.examine_all_triplets_and_sister_pairing( - tips, - tree_file, - summary, - groups_of_groups, - outgroup_taxa + tips, tree_file, summary, groups_of_groups, outgroup_taxa ) except FileNotFoundError: try: @@ -120,23 +119,20 @@ def loop_through_trees_and_examine_sister_support_among_triplets( sys.exit() except BrokenPipeError: pass - + return summary - def determine_groups_of_groups( - self, - groups_arr: list - ) -> dict: + def determine_groups_of_groups(self, groups_arr: list) -> dict: groups_of_groups = {} for group in groups_arr: temp = [] for i in range(1, 4): temp.append([taxon_name for taxon_name in group[i]]) - groups_of_groups[group[0]] = (temp) + groups_of_groups[group[0]] = temp outgroup_taxa = [taxon_name for taxon_name in group[4]] - + return groups_of_groups, outgroup_taxa def examine_all_triplets_and_sister_pairing( @@ -145,7 +141,7 @@ def examine_all_triplets_and_sister_pairing( tree_file: str, summary: dict, groups_of_groups: dict, - outgroup_taxa: list + outgroup_taxa: list, ) -> dict: """ evaluate all triplets for sister relationships. Polytomies @@ -153,23 +149,25 @@ def examine_all_triplets_and_sister_pairing( """ # get all combinations of three tips identifier = list(groups_of_groups.keys())[0] - triplet_tips = (list(itertools.product(*groups_of_groups[identifier]))) + triplet_tips = list(itertools.product(*groups_of_groups[identifier])) for triplet in triplet_tips: # obtain tree of the triplet tree = self.get_triplet_tree(tips, triplet, tree_file, outgroup_taxa) - cnt=0 + cnt = 0 try: for leaf in tree.get_terminals(): - cnt+=1 + cnt += 1 except AttributeError: continue - if tree and cnt==3: + if tree and cnt == 3: for _, groups in groups_of_groups.items(): # see if there any intersctions between # the triplet and the the group - num_groups_represented = self.count_number_of_groups_in_triplet(triplet, groups) - - # if one taxa is represented from each group, + num_groups_represented = self.count_number_of_groups_in_triplet( + triplet, groups + ) + + # if one taxa is represented from each group, # use the triplet tip_names = [] if num_groups_represented == 3: @@ -178,11 +176,7 @@ def examine_all_triplets_and_sister_pairing( self.set_branch_lengths_in_tree_to_one(tree) # determine sisters and add to sister pair counter summary = self.determine_sisters_and_add_to_counter( - tip_names, - tree, - tree_file, - groups, - summary + tip_names, tree, tree_file, groups, summary ) else: continue @@ -197,7 +191,7 @@ def count_number_of_groups_in_triplet(self, triplet: list, groups: tuple) -> int for group in groups: temp = [value for value in list(triplet) if value in group] if len(temp) >= 1: - num_groups_represented +=1 + num_groups_represented += 1 return num_groups_represented def set_branch_lengths_in_tree_to_one(self, tree: Tree) -> None: @@ -210,10 +204,10 @@ def check_if_triplet_is_a_polytomy(self, tree: Tree) -> Tree: """ count the number of internal branches. If 1, then the triplet is a polytomy """ - num_int=0 + num_int = 0 # check if the triplet is a polytomy for internal_branch in tree.get_nonterminals(): - num_int+=1 + num_int += 1 if num_int == 1: return True @@ -221,17 +215,14 @@ def check_if_triplet_is_a_polytomy(self, tree: Tree) -> Tree: return False def sister_relationship_counter( - self, - tree_file: str, - summary: dict, - sisters: str + self, tree_file: str, summary: dict, sisters: str ) -> dict: """ counter for how many times a particular sister relationship is observed """ # if tree is not in summary, create a key for it if tree_file not in summary.keys(): - summary[str(tree_file)]={} + summary[str(tree_file)] = {} # if the sister relationship is not in the tree file dict, create a key for it if sisters not in summary[str(tree_file)].keys(): summary[str(tree_file)][sisters] = 1 @@ -241,21 +232,17 @@ def sister_relationship_counter( return summary def get_triplet_tree( - self, - tips: list, - triplet: tuple, - tree_file: str, - outgroup_taxa: list + self, tips: list, triplet: tuple, tree_file: str, outgroup_taxa: list ) -> Tree: """ get a tree object of only the triplet of interest """ # determine tips that are not in the triplet of interest - tips_to_prune = list(set(tips)- set(list(triplet))) + tips_to_prune = list(set(tips) - set(list(triplet))) # determine tips that in the outgroup outgroup_present = [value for value in tips if value in outgroup_taxa] - tree = Phylo.read(tree_file, 'newick') - + tree = Phylo.read(tree_file, "newick") + # root tree on outgroup taxa try: tree.root_with_outgroup(outgroup_present) @@ -268,27 +255,23 @@ def get_triplet_tree( tree = False return tree - def determine_sisters_from_triplet( - self, - groups: list, - pair: tuple - ) -> str: + def determine_sisters_from_triplet(self, groups: list, pair: tuple) -> str: """ determine sister taxa from a triplet """ - sisters = sorted([[i for i, lst in enumerate(groups) if pair[0] in lst][0], [i for i, lst in enumerate(groups) if pair[1] in lst][0]]) - sisters = [str(i) for i in sisters] + sisters = sorted( + [ + [i for i, lst in enumerate(groups) if pair[0] in lst][0], + [i for i, lst in enumerate(groups) if pair[1] in lst][0], + ] + ) + sisters = [str(i) for i in sisters] sisters = str("-".join(sisters)) return sisters def determine_sisters_and_add_to_counter( - self, - tip_names: list, - tree: Tree, - tree_file: str, - groups: list, - summary: dict + self, tip_names: list, tree: Tree, tree_file: str, groups: list, summary: dict ) -> dict: """ determine which pair of taxa are sister to one another @@ -298,7 +281,7 @@ def determine_sisters_and_add_to_counter( pairs = list(itertools.combinations(tip_names, 2)) for pair in pairs: is_polytomy = self.check_if_triplet_is_a_polytomy(tree) - # if distance between pair is 2 and the triplet is + # if distance between pair is 2 and the triplet is # not a polytomy (i.e., having only 1 internal branch) # then report the sisters in the triplet if tree.distance(pair[0], pair[1]) == 2 and not is_polytomy: @@ -310,64 +293,51 @@ def determine_sisters_and_add_to_counter( return summary def get_triplet_and_gene_support_freq_counts( - self, - summary: dict + self, summary: dict ) -> Tuple[dict, dict]: """ count how many triplets and genes support the various sister relationships """ # Count the total number of sister pairings # for the three possible pairs for triplets - triplet_group_counts = { - 'g0g1_count' : 0, - 'g0g2_count' : 0, - 'g1g2_count' : 0 - } + triplet_group_counts = {"g0g1_count": 0, "g0g2_count": 0, "g1g2_count": 0} - # Also, keep track of which and how many genes + # Also, keep track of which and how many genes # support each sister pairing - gene_support_freq = { - '0-1' : 0, - '1-2' : 0, - '0-2' : 0 - } + gene_support_freq = {"0-1": 0, "1-2": 0, "0-2": 0} for tree in summary: - # create empty key value pairs in case sister + # create empty key value pairs in case sister # pairing was never observed - if '0-1' not in summary[tree].keys(): - summary[tree]['0-1'] = 0 - if '0-2' not in summary[tree].keys(): - summary[tree]['0-2'] = 0 - if '1-2' not in summary[tree].keys(): - summary[tree]['1-2'] = 0 + if "0-1" not in summary[tree].keys(): + summary[tree]["0-1"] = 0 + if "0-2" not in summary[tree].keys(): + summary[tree]["0-2"] = 0 + if "1-2" not in summary[tree].keys(): + summary[tree]["1-2"] = 0 # create a running value of triplets that support each sister pair - triplet_group_counts['g0g1_count'] += summary[tree]['0-1'] - triplet_group_counts['g0g2_count'] += summary[tree]['0-2'] - triplet_group_counts['g1g2_count'] += summary[tree]['1-2'] + triplet_group_counts["g0g1_count"] += summary[tree]["0-1"] + triplet_group_counts["g0g2_count"] += summary[tree]["0-2"] + triplet_group_counts["g1g2_count"] += summary[tree]["1-2"] # determine which sister pairing is best supported in a single gene # and add one to the corresponding gene support frequency count gene_support_freq[max(summary[tree], key=summary[tree].get)] += 1 return triplet_group_counts, gene_support_freq - def chisquare_tests( - self, - triplet_group_counts: dict, - gene_support_freq: dict - ): - + def chisquare_tests(self, triplet_group_counts: dict, gene_support_freq: dict): triplet_res = chisquare( [ - triplet_group_counts['g0g1_count'], - triplet_group_counts['g0g2_count'], - triplet_group_counts['g1g2_count'] + triplet_group_counts["g0g1_count"], + triplet_group_counts["g0g2_count"], + triplet_group_counts["g1g2_count"], ] ) - + gene_support_freq_res = chisquare( [ - gene_support_freq['0-1'], - gene_support_freq['0-2'],gene_support_freq['1-2'] + gene_support_freq["0-1"], + gene_support_freq["0-2"], + gene_support_freq["1-2"], ] ) @@ -394,10 +364,7 @@ def chisquare_tests( # pass def print_gene_support_freq_res( - self, - gene_support_freq_res, - gene_support_freq: dict, - trees_file_path: list + self, gene_support_freq_res, gene_support_freq: dict, trees_file_path: list ) -> None: """ print results to stdout for user @@ -407,7 +374,9 @@ def print_gene_support_freq_res( print(f"==============================") print(f"chi-squared: {round(gene_support_freq_res.statistic, 4)}") print(f"p-value: {round(gene_support_freq_res.pvalue, 6)}") - print(f"total genes: {(gene_support_freq['0-1'] + gene_support_freq['0-2'] + gene_support_freq['1-2'])}") + print( + f"total genes: {(gene_support_freq['0-1'] + gene_support_freq['0-2'] + gene_support_freq['1-2'])}" + ) print(f"0-1: {gene_support_freq['0-1']}") print(f"0-2: {gene_support_freq['0-2']}") print(f"1-2: {gene_support_freq['1-2']}") diff --git a/phykit/services/tree/print_tree.py b/phykit/services/tree/print_tree.py index 03c965e..374ad3f 100644 --- a/phykit/services/tree/print_tree.py +++ b/phykit/services/tree/print_tree.py @@ -1,16 +1,15 @@ -import logging - from Bio import Phylo from .base import Tree + class PrintTree(Tree): def __init__(self, args) -> None: super().__init__(**self.process_args(args)) def run(self): tree = self.read_tree_file() - + # remove branch lengths if specified by user # otherwise, print ascii tree if self.remove: @@ -23,6 +22,6 @@ def run(self): Phylo.draw_ascii(tree) except BrokenPipeError: pass - + def process_args(self, args): - return dict(tree_file_path=args.tree, remove=args.remove) \ No newline at end of file + return dict(tree_file_path=args.tree, remove=args.remove) diff --git a/phykit/services/tree/prune_tree.py b/phykit/services/tree/prune_tree.py index 6574fad..947d68f 100644 --- a/phykit/services/tree/prune_tree.py +++ b/phykit/services/tree/prune_tree.py @@ -2,13 +2,14 @@ from ...helpers.files import read_single_column_file_to_list + class PruneTree(Tree): def __init__(self, args) -> None: super().__init__(**self.process_args(args)) def run(self): tree = self.read_tree_file() - + taxa = read_single_column_file_to_list(self.list_of_taxa) if self.keep: @@ -16,11 +17,11 @@ def run(self): for term in tree.get_terminals(): tips_in_tree.append(term.name) taxa = [x for x in tips_in_tree if x not in taxa] - + tree = self.prune_tree_using_taxa_list(tree, taxa) self.write_tree_file(tree, self.output_file_path) - + def process_args(self, args): tree_file_path = args.tree if args.output is None: @@ -29,13 +30,13 @@ def process_args(self, args): output_file_path = f"{args.output}" if args.keep is None: - keep=True + keep = True else: - keep=args.keep + keep = args.keep return dict( tree_file_path=tree_file_path, list_of_taxa=args.list_of_taxa, output_file_path=output_file_path, - keep=keep - ) \ No newline at end of file + keep=keep, + ) diff --git a/phykit/services/tree/rename_tree_tips.py b/phykit/services/tree/rename_tree_tips.py index 005b79d..1d68b01 100644 --- a/phykit/services/tree/rename_tree_tips.py +++ b/phykit/services/tree/rename_tree_tips.py @@ -1,8 +1,8 @@ -import logging import sys from .base import Tree + class RenameTreeTips(Tree): def __init__(self, args) -> None: super().__init__(**self.process_args(args)) @@ -12,11 +12,11 @@ def run(self): # save idmap to a dictionary idmap = self.read_id_map(self.idmap) - + tree = self.replace_tip_names(tree, idmap) - + self.write_tree_file(tree, self.output_file_path) - + def process_args(self, args): tree_file_path = args.tree @@ -26,16 +26,16 @@ def process_args(self, args): output_file_path = f"{args.output}" return dict( - tree_file_path=tree_file_path, + tree_file_path=tree_file_path, idmap=args.idmap, - output_file_path=output_file_path - ) + output_file_path=output_file_path, + ) def read_id_map(self, idmap: str) -> dict: """ read two column file to dictionary """ - idmap={} + idmap = {} try: with open(self.idmap) as identifiers: for line in identifiers: @@ -61,6 +61,5 @@ def replace_tip_names(self, tree: Tree, idmap: dict) -> Tree: for term in tree.get_terminals(): if term.name in idmap: term.name = idmap[term.name] - - return tree + return tree diff --git a/phykit/services/tree/rf_distance.py b/phykit/services/tree/rf_distance.py index d25ba44..e83a38e 100644 --- a/phykit/services/tree/rf_distance.py +++ b/phykit/services/tree/rf_distance.py @@ -1,13 +1,3 @@ -import sys -import getopt -import os.path -import statistics as stat - -from Bio import Phylo -from Bio.Phylo.BaseTree import TreeMixin -import itertools -import numpy as np - from .base import Tree @@ -30,17 +20,18 @@ def run(self): tree_zero = self.prune_tree_using_taxa_list(tree_zero, tree_zero_tips_to_prune) tree_one = self.prune_tree_using_taxa_list(tree_one, tree_one_tips_to_prune) - tip_for_rooting = '' + tip_for_rooting = "" for term in tree_zero.get_terminals(): tip_for_rooting = term.name break tree_zero.root_with_outgroup(tip_for_rooting) tree_one.root_with_outgroup(tip_for_rooting) - plain_rf, normalized_rf = self.calculate_robinson_foulds_distance(tree_zero, tree_one) - - print(f"{plain_rf}\t{round(normalized_rf, 4)}") + plain_rf, normalized_rf = self.calculate_robinson_foulds_distance( + tree_zero, tree_one + ) + print(f"{plain_rf}\t{round(normalized_rf, 4)}") def process_args(self, args): return dict(tree_file_path=args.tree_zero, tree1_file_path=args.tree_one) @@ -51,18 +42,13 @@ def calculate_robinson_foulds_distance(self, tree_zero, tree_one): plain_rf = self.compare_trees(plain_rf, tree_one, tree_zero) # count the number of tips in a phylogeny tip_count = tree_zero.count_terminals() - + # calculate normalized rf distance - normalized_rf = (plain_rf/(2*(tip_count-3))) + normalized_rf = plain_rf / (2 * (tip_count - 3)) return plain_rf, normalized_rf - def compare_trees( - self, - plain_rf: int, - tree_zero: Tree, - tree_one: Tree - ) -> int: + def compare_trees(self, plain_rf: int, tree_zero: Tree, tree_one: Tree) -> int: # loop through tree_zero and find similar clade in tree_one for clade_zero in tree_zero.get_nonterminals()[1:]: # initialize and populate a list of tip names in tree_zero @@ -73,26 +59,18 @@ def compare_trees( tip_names_one = self.get_tip_names_from_tree(clade_one) # compare the list of tip names plain_rf = self.determine_if_clade_differs( - plain_rf, - tip_names_zero, - tip_names_one + plain_rf, tip_names_zero, tip_names_one ) return plain_rf def determine_if_clade_differs( - self, - plain_rf: int, - tip_names_zero: list, - tip_names_one: list + self, plain_rf: int, tip_names_zero: list, tip_names_one: list ) -> int: """ if clade differs, add 1 to plain_rf value """ if set(tip_names_zero) != set(tip_names_one): - plain_rf +=1 - - return plain_rf - - + plain_rf += 1 + return plain_rf diff --git a/phykit/services/tree/root_tree.py b/phykit/services/tree/root_tree.py index 6d6de62..45dea64 100644 --- a/phykit/services/tree/root_tree.py +++ b/phykit/services/tree/root_tree.py @@ -1,26 +1,25 @@ -import logging - from Bio import Phylo from .base import Tree from ...helpers.files import read_single_column_file_to_list + class RootTree(Tree): def __init__(self, args) -> None: super().__init__(**self.process_args(args)) def run(self): tree = self.read_tree_file() - + outgroup = read_single_column_file_to_list(self.outgroup_taxa_file_path) Phylo.BaseTree.Tree.root_with_outgroup(tree, outgroup) self.write_tree_file(tree, self.output_file_path) - + def process_args(self, args): - tree_file_path=args.tree + tree_file_path = args.tree if args.output is None: output_file_path = f"{tree_file_path}.rooted" @@ -30,5 +29,5 @@ def process_args(self, args): return dict( tree_file_path=tree_file_path, outgroup_taxa_file_path=args.root, - output_file_path=output_file_path - ) \ No newline at end of file + output_file_path=output_file_path, + ) diff --git a/phykit/services/tree/saturation.py b/phykit/services/tree/saturation.py index 2d28670..fa32945 100644 --- a/phykit/services/tree/saturation.py +++ b/phykit/services/tree/saturation.py @@ -2,12 +2,12 @@ import itertools from typing import Tuple -from Bio import AlignIO import scipy from .base import Tree from ...helpers.files import get_alignment_and_format as get_alignment_and_format_helper + class FileFormat(Enum): fasta = "fasta" clustal = "clustal" @@ -18,14 +18,17 @@ class FileFormat(Enum): phylip_rel = "phylip-relaxed" stockholm = "stockholm" + class Saturation(Tree): def __init__(self, args) -> None: super().__init__(**self.process_args(args)) def run(self): # read in alignment - alignment, alignment_format = get_alignment_and_format_helper(self.alignment_file_path) - + alignment, alignment_format = get_alignment_and_format_helper( + self.alignment_file_path + ) + # read in tree tree = self.read_tree_file() @@ -35,36 +38,30 @@ def run(self): # for pairwise combinations, calculate patristic # distances and pairwise identities - patristic_distances, pairwise_identities = self.loop_through_combos_and_calculate_pds_and_pis( - combos, - alignment, - tree - ) - + ( + patristic_distances, + pairwise_identities, + ) = self.loop_through_combos_and_calculate_pds_and_pis(combos, alignment, tree) + # calculate linear regression - _, _, r_value, _, _ = scipy.stats.linregress(pairwise_identities, patristic_distances) - + _, _, r_value, _, _ = scipy.stats.linregress( + pairwise_identities, patristic_distances + ) + # report res self.print_res( - self.verbose, - combos, - pairwise_identities, - patristic_distances, - r_value + self.verbose, combos, pairwise_identities, patristic_distances, r_value ) def process_args(self, args): return dict( tree_file_path=args.tree, alignment_file_path=args.alignment, - verbose=args.verbose + verbose=args.verbose, ) def loop_through_combos_and_calculate_pds_and_pis( - self, - combos: list, - alignment, - tree + self, combos: list, alignment, tree ) -> Tuple[list, list]: """ loop through all taxon combinations and determine @@ -83,18 +80,14 @@ def loop_through_combos_and_calculate_pds_and_pis( return patristic_distances, pairwise_identities def calculate_pairwise_identities( - self, - alignment, - pairwise_identities: list, - aln_len: int, - combo: tuple + self, alignment, pairwise_identities: list, aln_len: int, combo: tuple ) -> list: """ calculate pairwise identities for a given combo """ identities = 0 - seq_one = '' - seq_two = '' + seq_one = "" + seq_two = "" for record in alignment: if record.name == combo[0]: seq_one = record.seq @@ -113,15 +106,19 @@ def print_res( combos: list, pairwise_identities: list, patristic_distances: list, - r_value: float + r_value: float, ) -> None: """ print results to stdout """ try: if verbose: - for combo, pairwise_identity, patristic_distance in zip(combos, pairwise_identities, patristic_distances): - print(f"{combo[0]}-{combo[1]}\t{round(pairwise_identity,4)}\t{round(patristic_distance, 4)}") + for combo, pairwise_identity, patristic_distance in zip( + combos, pairwise_identities, patristic_distances + ): + print( + f"{combo[0]}-{combo[1]}\t{round(pairwise_identity,4)}\t{round(patristic_distance, 4)}" + ) else: print(round(r_value**2, 4)) except BrokenPipeError: diff --git a/phykit/services/tree/spurious_sequence.py b/phykit/services/tree/spurious_sequence.py index aa18785..5914bae 100644 --- a/phykit/services/tree/spurious_sequence.py +++ b/phykit/services/tree/spurious_sequence.py @@ -3,6 +3,7 @@ from .base import Tree + class SpuriousSequence(Tree): def __init__(self, args) -> None: super().__init__(**self.process_args(args)) @@ -10,56 +11,57 @@ def __init__(self, args) -> None: def run(self): tree = self.read_tree_file() factor = self.factor - name_and_branch_len, threshold, median = self.identify_spurious_sequence(tree, factor) - + name_and_branch_len, threshold, median = self.identify_spurious_sequence( + tree, factor + ) + counter = 0 for name, length in name_and_branch_len.items(): if length >= threshold: try: - print(f"{name}\t{round(length, 4)}\t{round(threshold, 4)}\t{round(median, 4)}") + print( + f"{name}\t{round(length, 4)}\t{round(threshold, 4)}\t{round(median, 4)}" + ) except BrokenPipeError: pass counter += 1 - + # if no terminal branch is longer than the one specified # inform the user and print "None" if counter == 0: print("None") - def process_args(self, args): return dict(tree_file_path=args.tree, factor=args.factor) def identify_spurious_sequence(self, tree, factor): - branch_lengths, name_and_branch_len = self.get_branch_lengths_and_their_names(tree) + branch_lengths, name_and_branch_len = self.get_branch_lengths_and_their_names( + tree + ) median = stat.median(branch_lengths) if factor is None: factor = 20 - threshold = median*factor + threshold = median * factor return name_and_branch_len, threshold, median - - def get_branch_lengths_and_their_names( - self, - tree - ) -> Tuple[list, list]: + + def get_branch_lengths_and_their_names(self, tree) -> Tuple[list, list]: # initialize a list to hold branch lengths and a # dictionary with terminal names and the branch # lengths that lead up to them. branch_lengths = [] name_and_branch_len = {} - + # collect terminal branch lengths for terminal in tree.get_terminals(): branch_lengths.append(terminal.branch_length) name_and_branch_len[terminal.name] = terminal.branch_length - + # collect internal branch lengths for internal in tree.get_nonterminals(): branch_lengths.append(terminal.branch_length) - return branch_lengths, name_and_branch_len - + return branch_lengths, name_and_branch_len diff --git a/phykit/services/tree/terminal_branch_stats.py b/phykit/services/tree/terminal_branch_stats.py index 986c5f9..253b577 100644 --- a/phykit/services/tree/terminal_branch_stats.py +++ b/phykit/services/tree/terminal_branch_stats.py @@ -2,7 +2,11 @@ from .base import Tree -from ...helpers.stats_summary import calculate_summary_statistics_from_arr, print_summary_statistics +from ...helpers.stats_summary import ( + calculate_summary_statistics_from_arr, + print_summary_statistics, +) + class TerminalBranchStats(Tree): def __init__(self, args) -> None: @@ -38,25 +42,28 @@ def get_terminal_branch_lengths(self, tree) -> list: temp.append(terminal_branch.name) lengths_and_names.append(temp) - return terminal_branch_lengths, lengths_and_names + return terminal_branch_lengths, lengths_and_names - def check_tree_has_branch_lengths(self, terminal_branch_lengths:list) -> None: + def check_tree_has_branch_lengths(self, terminal_branch_lengths: list) -> None: """ if tree has no branch lengths, exit """ if len(terminal_branch_lengths) == 0: - print("Calculating terminal branch statistics requires a phylogeny with branch lengths.") + print( + "Calculating terminal branch statistics requires a phylogeny with branch lengths." + ) sys.exit() - def calculate_terminal_branch_stats(self, tree): # save terminal branch lengths to terminal_branch_lengths - terminal_branch_lengths, lengths_and_names = self.get_terminal_branch_lengths(tree) - + terminal_branch_lengths, lengths_and_names = self.get_terminal_branch_lengths( + tree + ) + # If the phylogeny had no branch lengths, inform user and quit self.check_tree_has_branch_lengths(terminal_branch_lengths) - + # calculate summary stats stats = calculate_summary_statistics_from_arr(terminal_branch_lengths) - return terminal_branch_lengths, stats, lengths_and_names \ No newline at end of file + return terminal_branch_lengths, stats, lengths_and_names diff --git a/phykit/services/tree/tip_labels.py b/phykit/services/tree/tip_labels.py index 69ff5f0..ee0fe16 100644 --- a/phykit/services/tree/tip_labels.py +++ b/phykit/services/tree/tip_labels.py @@ -1,9 +1,6 @@ -import logging - -from Bio import Phylo - from .base import Tree + class TipLabels(Tree): def __init__(self, args) -> None: super().__init__(**self.process_args(args)) @@ -15,6 +12,6 @@ def run(self): print(leaf.name) except BrokenPipeError: pass - + def process_args(self, args): - return dict(tree_file_path=args.tree) \ No newline at end of file + return dict(tree_file_path=args.tree) diff --git a/phykit/services/tree/tip_to_tip_distance.py b/phykit/services/tree/tip_to_tip_distance.py index 35edf11..b25565f 100644 --- a/phykit/services/tree/tip_to_tip_distance.py +++ b/phykit/services/tree/tip_to_tip_distance.py @@ -29,8 +29,4 @@ def check_leaves(self, tree_zero, tip_1, tip_2): sys.exit() def process_args(self, args): - return dict( - tree_file_path=args.tree_zero, - tip_1=args.tip_1, - tip_2=args.tip_2 - ) + return dict(tree_file_path=args.tree_zero, tip_1=args.tip_1, tip_2=args.tip_2) diff --git a/phykit/services/tree/tip_to_tip_node_distance.py b/phykit/services/tree/tip_to_tip_node_distance.py index 290e662..6ed1bee 100644 --- a/phykit/services/tree/tip_to_tip_node_distance.py +++ b/phykit/services/tree/tip_to_tip_node_distance.py @@ -29,8 +29,4 @@ def check_leaves(self, tree_zero, tip_1, tip_2): sys.exit() def process_args(self, args): - return dict( - tree_file_path=args.tree_zero, - tip_1=args.tip_1, - tip_2=args.tip_2 - ) + return dict(tree_file_path=args.tree_zero, tip_1=args.tip_1, tip_2=args.tip_2) diff --git a/phykit/services/tree/total_tree_length.py b/phykit/services/tree/total_tree_length.py index fc298d1..67bfe83 100644 --- a/phykit/services/tree/total_tree_length.py +++ b/phykit/services/tree/total_tree_length.py @@ -10,7 +10,6 @@ def run(self): total_tree_length = self.calculate_total_tree_length(tree) print(round(total_tree_length, 4)) - def process_args(self, args): return dict(tree_file_path=args.tree) diff --git a/phykit/services/tree/treeness.py b/phykit/services/tree/treeness.py index 70e2ba3..5b8ba66 100644 --- a/phykit/services/tree/treeness.py +++ b/phykit/services/tree/treeness.py @@ -10,6 +10,5 @@ def run(self): treeness = self.calculate_treeness(tree) print(round(treeness, 4)) - def process_args(self, args): return dict(tree_file_path=args.tree) diff --git a/phykit/services/tree/treeness_over_rcv.py b/phykit/services/tree/treeness_over_rcv.py index 959a4d6..538df38 100644 --- a/phykit/services/tree/treeness_over_rcv.py +++ b/phykit/services/tree/treeness_over_rcv.py @@ -1,11 +1,10 @@ from enum import Enum -from Bio import AlignIO - from .base import Tree from ..alignment.base import Alignment from ...helpers.files import get_alignment_and_format + class FileFormat(Enum): fasta = "fasta" clustal = "clustal" @@ -16,6 +15,7 @@ class FileFormat(Enum): phylip_rel = "phylip-relaxed" stockholm = "stockholm" + class TreenessOverRCV(Tree): def __init__(self, args) -> None: super().__init__(**self.process_args(args)) @@ -23,22 +23,24 @@ def __init__(self, args) -> None: def run(self): # calculate treeness treeness = self.calculate_treeness() - + # calculate rcv aln = Alignment(alignment_file_path=self.alignment_file_path) relative_composition_variability = aln.calculate_rcv() # calculate treeness/rcv - treeness_over_rcv = self.calculate_treeness_over_rcv(treeness, relative_composition_variability) - - # print results - print(f"{round(treeness_over_rcv, 4)}\t{round(treeness, 4)}\t{round(relative_composition_variability, 4)}") + treeness_over_rcv = self.calculate_treeness_over_rcv( + treeness, relative_composition_variability + ) + # print results + print( + f"{round(treeness_over_rcv, 4)}\t{round(treeness, 4)}\t{round(relative_composition_variability, 4)}" + ) def process_args(self, args): return dict(tree_file_path=args.tree, alignment_file_path=args.alignment) - + def calculate_treeness_over_rcv(self, treeness, relative_composition_variability): - treeness_over_rcv = (treeness / relative_composition_variability) + treeness_over_rcv = treeness / relative_composition_variability return treeness_over_rcv - diff --git a/phykit/version.py b/phykit/version.py index 666b2f7..438a38d 100644 --- a/phykit/version.py +++ b/phykit/version.py @@ -1 +1 @@ -__version__ = '1.12.0' +__version__ = "1.12.1"