Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ClipKit #93

Open
wants to merge 19 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ nosetests.xml
TreeSAPP.Rproj
.Rhistory
.RData
.python-version*


# Data
Expand Down
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
## [0.11.5] - 2022-06
### Added
### Fixed
### Changed
- Switched BMGE for ClipKit (#67)

## [0.11.4] - 2022-05-22
### Added
- Centroid inference for pOTUs based on the midpoint, or balance point, of all cluster members.
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ tqdm >=4.50.0
pytest >=6.2.5
pandas >=1.1.0
matplotlib >=3.3.0
clipkit >= 1.3.0
9 changes: 2 additions & 7 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from setuptools import Extension
from setuptools import setup, find_packages

with open("README.md", "r") as readme:
Expand All @@ -22,7 +21,7 @@
SETUP_METADATA = \
{
"name": "treesapp",
"version": "0.11.4",
"version": "0.11.5",
"description": "TreeSAPP is a functional and taxonomic annotation tool for genomes and metagenomes.",
"long_description": LONG_DESCRIPTION,
"long_description_content_type": "text/markdown",
Expand All @@ -34,16 +33,12 @@
"include_package_data": True,
"entry_points": {'console_scripts': ['treesapp = treesapp.__main__:main']},
"classifiers": CLASSIFIERS,
"ext_modules": [Extension("_tree_parser",
sources=["treesapp/extensions/tree_parsermodule.cpp"],
language="c++")
],
"install_requires": open("requirements.txt").read().splitlines(),
"setup_requires": [
"setuptools>=50.0.0"
],
"extras_require": {
'test': ['pytest', 'pytest-cov'],
'tests': ['pytest', 'pytest-cov'],
}
}

Expand Down
49 changes: 49 additions & 0 deletions tests/test_assign.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,55 @@ def test_load_refpkg_classifiers(self):

return

def test_bin_hmm_matches_by_region(self):

return

def test_bin_hmm_matches_by_identity(self):
return

def test_write_grouped_fastas(self):
from treesapp import assign
from treesapp import fasta
import random
from string import ascii_uppercase
seq_dict = {}
seq_name_idx = {}
rp_name = "PuhA"
scaler = 3
n_seqs = self.refpkg_dict[rp_name].num_seqs * scaler

# Test with empty inputs
fasta_file_group_map = assign.write_grouped_fastas(extracted_seq_dict=seq_dict,
seq_name_index=seq_name_idx,
refpkg_dict=self.refpkg_dict,
output_dir=self.output_dir)
self.assertEqual({}, fasta_file_group_map)

# Test real condition
seq_dict.update({rp_name: {"99": {-1*n: ''.join(random.choice(ascii_uppercase) for _ in range(50))
for n in range(n_seqs)}}})
seq_name_idx.update({rp_name: {-1*x: "seq_{}".format(x) for x in range(n_seqs)}})
fasta_file_group_map = assign.write_grouped_fastas(extracted_seq_dict=seq_dict,
seq_name_index=seq_name_idx,
refpkg_dict=self.refpkg_dict,
output_dir=self.output_dir)
fasta_files = list(fasta_file_group_map[rp_name])
self.assertEqual(scaler, len(fasta_files))
self.assertEqual([os.path.join(self.output_dir,
"{}_hmm_purified_group{}.faa".format(rp_name, n)) for n in range(scaler)],
fasta_files)
# Ensure there are the right number of sequences in each file
for file_path in fasta_files:
self.assertEqual(self.refpkg_dict[rp_name].num_seqs,
len(fasta.get_headers(file_path)))

return

def test_bin_hmm_matches(self):
self.assertTrue(False)
return


if __name__ == '__main__':
unittest.main()
5 changes: 3 additions & 2 deletions tests/test_classy.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,12 @@ def test_furnish_with_arguments(self):
args.input = [self.fasta]
args.output = self.output_dir
args.molecule = "prot"
args.executables = {'prodigal': '/home/connor/bin/prodigal', 'BMGE.jar': '/usr/local/bin/BMGE.jar',
args.executables = {'prodigal': '/home/connor/bin/prodigal',
'hmmbuild': '/usr/local/bin/hmmbuild',
'hmmalign': '/usr/local/bin/hmmalign',
'hmmsearch': '/usr/local/bin/hmmsearch',
'epa-ng': '/usr/local/bin/epa-ng', 'raxml-ng': '/usr/local/bin/raxml-ng'}
'epa-ng': '/usr/local/bin/epa-ng',
'raxml-ng': '/usr/local/bin/raxml-ng'}
self.db.furnish_with_arguments(args)
self.assertEqual(len(args.executables), len(self.db.executables))
self.assertEqual(self.fasta, self.db.input_sequences)
Expand Down
43 changes: 43 additions & 0 deletions tests/test_clipkit_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import os
import unittest

from .testing_utils import get_test_data


class MyTestCase(unittest.TestCase):
def setUp(self) -> None:
self.test_fa = get_test_data('PuhA.mfa')
self.output_fa = 'PuhA.trim.mfa'

def tearDown(self) -> None:
if os.path.isfile(self.output_fa):
os.remove(self.output_fa)

def test_run(self):
from treesapp import clipkit_helper
from clipkit import modes as ck_modes
ck = clipkit_helper.ClipKitHelper(fasta_in=self.test_fa,
output_dir='./',
mode="smart-gap",
min_len=200)
ck.run()
ck.compare_original_and_trimmed_multiple_alignments()
ck.summarise_trimming()
self.assertTrue(os.path.isfile(self.output_fa))

ck.mode = ck_modes.TrimmingMode("kpi-smart-gap")
ck.run()
ck.compare_original_and_trimmed_multiple_alignments()
ck.summarise_trimming()
self.assertTrue(os.path.isfile(self.output_fa))

ck.mode = ck_modes.TrimmingMode("gappy")
ck.run()
ck.compare_original_and_trimmed_multiple_alignments()
ck.summarise_trimming()
self.assertTrue(os.path.isfile(self.output_fa))
return


if __name__ == '__main__':
unittest.main()
18 changes: 13 additions & 5 deletions tests/test_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,11 +106,17 @@ def test_assign(self):
"--placement_summary", "aelw",
"--trim_align", "--svm"]
assign.assign(assign_commands_list)
self.assertEqual(13, len(read_classification_table(assignments_tbl)))
assigned_queries = 15
self.assertEqual(assigned_queries,
len(read_classification_table(assignments_tbl)))
self.assertTrue(os.path.isfile(classified_seqs_faa))
assign.assign(assign_commands_list + ["--targets", "McrA,McrB,XmoA"])
self.assertEqual(18, len(read_classification_table(assignments_tbl)))
self.assertEqual(18, len(fasta.get_headers(classified_seqs_faa)))
assign.assign(assign_commands_list + ["--targets", "McrA,McrB,XmoA",
"--min_seq_length", str(30)])
assigned_queries = 17
self.assertEqual(assigned_queries,
len(read_classification_table(assignments_tbl)))
self.assertEqual(assigned_queries,
len(fasta.get_headers(classified_seqs_faa)))

# Test nucleotide sequence input WITHOUT targets listed
assign_commands_list = ["--fastx_input", self.nt_test_fa,
Expand All @@ -124,8 +130,10 @@ def test_assign(self):
"--reads", get_test_data("SRR3669912_1.fastq"),
"--reverse", get_test_data("SRR3669912_2.fastq")]
assign.assign(assign_commands_list)
assigned_queries = 8
lines = read_classification_table(assignments_tbl)
self.assertEqual(8, len(lines))
self.assertEqual(assigned_queries,
len(lines))
classified_seqs = set()
pqueries = assignments_to_pqueries(lines)
for rp_pqs in pqueries.values():
Expand Down
39 changes: 0 additions & 39 deletions tests/test_file_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,45 +134,6 @@ def test_read_lineage_ids(self):
read_lineage_ids(get_test_data("McrA_lineage_ids - GTDB_map.tsv"))
return

def test_check_seq_name_integer_compatibility(self):
from treesapp import file_parsers
msa, n_refs = file_parsers.check_seq_name_integer_compatibility(seq_dict=self.test_fasta_data)
self.assertEqual(-1, n_refs)
self.assertEqual("Bad_header_name", msa.popitem()[0])
return

def test_validate_alignment_trimming(self):
from treesapp import file_parsers
from treesapp import fasta
test_msa = get_test_data("PuhA.mfa")
headers = set([str(i) + "_PuhA" for i in range(1, 48)])
tmp_fasta = fasta.FASTA(file_name=test_msa)
tmp_fasta.load_fasta()

# Test bad file extension
with pytest.raises(SystemExit):
file_parsers.validate_alignment_trimming(["PuhA.stk"], headers)

# Fail due to a bad sequence name in a fasta
with pytest.raises(SystemExit):
file_parsers.validate_alignment_trimming([self.test_data_file], set(self.test_fasta_data))

# Ensure success
success, fail, msg = file_parsers.validate_alignment_trimming(msa_files=[test_msa],
unique_ref_headers=set(tmp_fasta.get_seq_names()))
self.assertEqual(32, len(success[test_msa]))
self.assertEqual([], fail)
self.assertIsInstance(msg, str)

# MSA fails due to more sequence names in unique_ref_headers than MSA
success, fail, msg = file_parsers.validate_alignment_trimming(msa_files=[test_msa],
unique_ref_headers=headers)
self.assertEqual({}, success)
self.assertEqual([test_msa], fail)
self.assertIsInstance(msg, str)

return


if __name__ == '__main__':
unittest.main()
2 changes: 1 addition & 1 deletion tests/test_graftm_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def test_prep_graftm_ref_files(self):
from treesapp import utilities
# Find the executables
exe_map = {}
for dep in ["hmmbuild", "hmmalign", "raxml-ng", "mafft", "BMGE.jar"]:
for dep in ["hmmbuild", "hmmalign", "raxml-ng", "mafft"]:
exe_map[dep] = utilities.fetch_executable_path(dep, self.ts_dir)

taxon_str = 'd__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhizobiales'
Expand Down
39 changes: 39 additions & 0 deletions tests/test_multiple_alignment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import os
import unittest

from .testing_utils import get_test_data


class MyTestCase(unittest.TestCase):
def test_trim_multiple_alignments(self):
from treesapp import multiple_alignment
from treesapp import refpkg
test_fa = get_test_data('PuhA.mfa')
trim_file = os.path.join("tests", "test_data", "PuhA.trim.mfa")
qc_file = os.path.join("tests", "test_data", "PuhA.trim.qc.mfa")
test_rp = refpkg.ReferencePackage(refpkg_name="PuhA")
test_rp.f__pkl = get_test_data(filename=os.path.join("refpkgs", "PuhA_build.pkl"))
test_rp.slurp()

result = multiple_alignment.trim_multiple_alignment_farmer([{"qry_ref_mfa": test_fa,
"refpkg_name": "PuhA",
"gap_tuned": True,
"avg_id": 88}],
min_seq_length=10,
n_proc=1,
ref_pkgs={"PuhA": test_rp},
for_placement=False)
self.assertTrue(os.path.isfile(trim_file))
self.assertIsInstance(result, dict)
self.assertTrue("PuhA" in result.keys())
self.assertEqual(os.path.basename(qc_file),
os.path.basename(result["PuhA"].pop()))

for f_path in [trim_file, qc_file]:
if os.path.isfile(f_path):
os.remove(f_path)
return


if __name__ == '__main__':
unittest.main()
4 changes: 3 additions & 1 deletion tests/test_placement_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def setUp(self) -> None:
# Executables dictionary
self.exes = {}
self.treesapp_dir = os.path.abspath(os.path.dirname(os.path.realpath(__file__))) + os.sep
for dep in ["hmmbuild", "hmmalign", "raxml-ng", "mafft", "BMGE.jar"]:
for dep in ["hmmbuild", "hmmalign", "raxml-ng", "mafft", "epa-ng"]:
self.exes[dep] = fetch_executable_path(dep, get_treesapp_root())
return

Expand Down Expand Up @@ -80,6 +80,8 @@ def test_clade_exclusion_phylo_placement(self):
'634498.mru_1924'],
'd__Bacteria; p__Proteobacteria; c__Gammaproteobacteria': ['523846.Mfer_0784',
'79929.MTBMA_c15480']}}

# This will fail since query sequences (McrA) are unrelated to reference package (PuhA)
pqueries = clade_exclusion_phylo_placement(rank_training_seqs=train_seqs,
test_fasta=self.bad_fasta, ref_pkg=self.test_refpkg,
executables=self.exes, min_seqs=3, output_dir=self.output_dir)
Expand Down
28 changes: 27 additions & 1 deletion tests/test_refpkg.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,9 @@ def setUp(self) -> None:
# Find the executables
self.exe_map = {}
self.treesapp_dir = os.path.abspath(os.path.dirname(os.path.realpath(__file__))) + os.sep
for dep in ["hmmbuild", "hmmalign", "raxml-ng", "mafft", "BMGE.jar", "FastTree"]:
for dep in ["hmmbuild", "hmmalign", "raxml-ng", "mafft", "FastTree"]:
self.exe_map[dep] = fetch_executable_path(dep, utils.get_treesapp_root())
self.n_cpu = utils.NUM_THREADS
return

def tearDown(self) -> None:
Expand All @@ -64,6 +65,31 @@ def test_band(self):
self.db.band()
return

def test_blast(self):
import Bio.Align
import timeit

qseq = "AMQIGMSFISXYKVCAGEAAVADLAFAAKHAGVIQMADILPARRARGPNEPGGIKFGHFC" \
"DMIQGDRKYPNDPVRANLEVVAAGAMLFDQIWLGSYMSGGVGFTQYATAAYTDNILDDYC" \
"EYGVDYIKKKHGGIAKAKSTQEVVSDIATEVNLYGMEQYESYPTALESHFGGSQRASVLA" \
"AASGLTCSLATANSNAGLNGWYLSMLMHKEGWSRLGFFGYDLQDQCGSANSMSIRPDEGL" \
"LGELRGPNYPNYAI"
ref_pkg = self.db

exec_time = timeit.timeit(stmt="ref_pkg.blast(qseq=qseq, n_proc=n_proc)",
globals={'ref_pkg': ref_pkg,
'qseq': qseq,
'n_proc': self.n_cpu},
number=10)
self.assertTrue(0 < exec_time < 10)

aln, seq_id, g_seq_id = ref_pkg.blast(qseq,
n_proc=self.n_cpu) # type: Bio.Align.PairwiseAlignment
self.assertEqual(100, aln.score)
self.assertEqual(62, round(seq_id, 0))
self.assertEqual(89, round(g_seq_id, 0))
return

def test_disband(self):
# Basic disband
self.db.disband(output_dir="./tests/")
Expand Down
7 changes: 3 additions & 4 deletions tests/test_training_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def test_generate_pquery_data_for_trainer(self):

treesapp_dir = get_treesapp_root()
executables = {}
for dep in ["hmmbuild", "hmmalign", "hmmsearch", "epa-ng", "raxml-ng", "FastTree", "mafft", "BMGE.jar"]:
for dep in ["hmmbuild", "hmmalign", "hmmsearch", "epa-ng", "raxml-ng", "FastTree", "mafft"]:
executables[dep] = fetch_executable_path(dep, treesapp_dir)
pbar = tqdm()
test_taxon_one = "f__Bradyrhizobiaceae; g__Bradyrhizobium; s__Bradyrhizobium 'sp.' BTAi1"
Expand All @@ -68,10 +68,9 @@ def test_generate_pquery_data_for_trainer(self):

def test_fetch_executable_path(self):
from treesapp.utilities import fetch_executable_path
from re import sub
treesapp_dir = get_treesapp_root()
exe_path = fetch_executable_path("BMGE.jar", treesapp_dir)
self.assertEqual("/sub_binaries/BMGE.jar", sub(treesapp_dir, '', exe_path))
exe_path = fetch_executable_path("epa-ng", treesapp_dir)
self.assertEqual("epa-ng", os.path.basename(exe_path))
return

def test_load_training_data_frame(self):
Expand Down
3 changes: 3 additions & 0 deletions tests/testing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
import ete3


NUM_THREADS = os.cpu_count()


def random_ete_tree(leaf_names: list, branch_len_dist=None) -> ete3.Tree:
if not branch_len_dist:
branch_len_dist = (0, 1)
Expand Down
Loading