From 2858dd753ffbdcfe5c1a41598d810c6874feac4d Mon Sep 17 00:00:00 2001
From: Christian Diener <ch.diener@gmail.com>
Date: Sun, 29 Oct 2023 12:19:12 -0700
Subject: [PATCH] allow unifying ranks (#145)

* allow unifying ranks

* check if osqp is there

* check if osqp is there
---
 .github/workflows/pythonpackage.yml |  2 +-
 micom/community.py                  |  2 +
 micom/taxonomy.py                   | 79 ++++++++++++++++++++++++-----
 tests/test_osqp.py                  |  6 +++
 tests/test_taxonomy.py              | 35 +++++++++++++
 5 files changed, 111 insertions(+), 13 deletions(-)
 create mode 100644 tests/test_taxonomy.py

diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml
index 712c659f..15414c24 100644
--- a/.github/workflows/pythonpackage.yml
+++ b/.github/workflows/pythonpackage.yml
@@ -30,7 +30,7 @@ jobs:
       run: |
         python -m pip install --upgrade pip cryptography
         pip install wheel numpy Cython
-        pip install biom-format
+        pip install biom-format highspy
     - name: Install MICOM
       run: pip install -e .
     - name: install CPLEX
diff --git a/micom/community.py b/micom/community.py
index 9f133446..0f16d40d 100755
--- a/micom/community.py
+++ b/micom/community.py
@@ -21,6 +21,7 @@
 from micom.optcom import optcom, solve
 from micom.problems import cooperative_tradeoff, knockout_taxa
 from micom.qiime_formats import load_qiime_model_db
+from micom.taxonomy import unify_rank_prefixes
 from rich.progress import track
 from tempfile import TemporaryDirectory
 
@@ -209,6 +210,7 @@ def __init__(
                 if r in taxonomy.columns and r in manifest.columns
             ]
             manifest = manifest[keep_cols + ["file"]]
+            taxonomy = unify_rank_prefixes(taxonomy, manifest)
             merged = pd.merge(taxonomy, manifest, on=keep_cols)
 
             self.__db_metrics = pd.Series(
diff --git a/micom/taxonomy.py b/micom/taxonomy.py
index c79be5d4..2099ee12 100644
--- a/micom/taxonomy.py
+++ b/micom/taxonomy.py
@@ -1,6 +1,5 @@
 """Helpers to convert external data to a MICOM taxonomy."""
 
-from micom.community import _ranks
 from micom.qiime_formats import (
     load_qiime_feature_table,
     load_qiime_taxonomy,
@@ -8,26 +7,27 @@
 import pandas as pd
 
 
+RANKS = ["kingdom", "phylum", "class", "order", "family", "genus", "species", "strain"]
+
+
 def build_from_qiime(
-    abundance,
+    abundance: pd.DataFrame,
     taxonomy: pd.Series,
-    collapse_on="genus"
+    collapse_on: str = "genus",
+    trim_rank_prefix: bool = False,
 ) -> pd.DataFrame:
     """Build the specification for the community models."""
-    taxa = taxonomy.str.replace("[\\w_]+__|\\[|\\]", "", regex=True)
+    if trim_rank_prefix:
+        taxa = taxonomy.str.replace("[\\w_]+__|\\[|\\]", "", regex=True)
     taxa = taxa.str.split(";\\s*", expand=True).replace("", None)
-    taxa.columns = _ranks[0 : taxa.shape[1]]
+    taxa.columns = RANKS[0 : taxa.shape[1]]
     taxa["taxid"] = taxonomy.index
     taxa.index == taxa.taxid
 
     if isinstance(collapse_on, str):
         collapse_on = [collapse_on]
 
-    ranks = [
-        r
-        for r in collapse_on
-        if r in taxa.columns
-    ]
+    ranks = [r for r in collapse_on if r in taxa.columns]
     taxa["mapping_ranks"] = taxa[ranks].apply(
         lambda s: "|".join(s.astype("str")), axis=1
     )
@@ -35,7 +35,8 @@ def build_from_qiime(
     abundance = (
         abundance.collapse(
             lambda id_, x: taxa.loc[id_, "mapping_ranks"],
-            axis="observation", norm=False
+            axis="observation",
+            norm=False,
         )
         .to_dataframe(dense=True)
         .T
@@ -48,7 +49,7 @@ def build_from_qiime(
     abundance = pd.merge(
         abundance[abundance.abundance > 0.0],
         taxa[ranks + ["mapping_ranks"]].drop_duplicates(),
-        on="mapping_ranks"
+        on="mapping_ranks",
     )
     abundance["id"] = abundance["mapping_ranks"].replace(
         r"[^A-Za-z0-9_]+", "_", regex=True
@@ -84,3 +85,57 @@ def qiime_to_micom(feature_table, taxonomy, collapse_on="genus"):
     taxonomy = load_qiime_taxonomy(taxonomy)
 
     return build_from_qiime(table, taxonomy, collapse_on)
+
+
+def rank_prefixes(manifest: pd.DataFrame) -> pd.Series:
+    """Get the used prefixes for taxonomic ranks.
+
+    Arguments
+    ---------
+    manifest : pandas.DataFrame
+        A model database manifest.
+
+    Returns
+    -------
+    pandas.Series
+        The detected prefix for each taxonomic rank in the manifest.
+    """
+    ranks = [c for c in manifest.columns if c.lower() in RANKS]
+    prefixes = pd.Series(
+        {r: manifest[r].str.extract(r"^([a-z]__)").iloc[0, 0] for r in ranks}
+    )
+
+    return prefixes
+
+
+def unify_rank_prefixes(taxonomy: pd.DataFrame, manifest: pd.DataFrame) -> pd.DataFrame:
+    """Handle taxonomic rank prefixes in the taxonomy or database manifest.
+
+    Arguments
+    ---------
+    taxonomy : pandas.DataFrame
+        A taxonomy table.
+    manifest : pandas.DataFrame
+        A database manifest.
+
+    Returns
+    -------
+    tuple of pandas.DataFrame
+        The taxonomy with adjusted taxa names consistent with the database.
+    """
+    tax_prefixes = rank_prefixes(taxonomy)
+    db_prefixes = rank_prefixes(manifest)
+    ranks = tax_prefixes.index[tax_prefixes.index.isin(db_prefixes.index)]
+    if all(tax_prefixes[ranks] == db_prefixes[ranks]):
+        return taxonomy
+
+    taxonomy = taxonomy.copy()
+    ranks = [c for c in taxonomy.columns if c.lower() in RANKS]
+    if db_prefixes.isna().all():
+        for r in ranks:
+            taxonomy[r] = taxonomy[r].str.replace(r"^[a-z]__", "", regex=True)
+    else:
+        for r in ranks:
+            taxonomy[r] = db_prefixes[r] + taxonomy[r]
+
+    return taxonomy
diff --git a/tests/test_osqp.py b/tests/test_osqp.py
index 408dfb20..d27c43be 100644
--- a/tests/test_osqp.py
+++ b/tests/test_osqp.py
@@ -4,6 +4,7 @@
 """
 
 from .fixtures import community
+import cobra.util.solver as su
 import micom.data as md
 from micom.workflows import (
     build,
@@ -17,6 +18,11 @@
 import pytest
 from pytest import approx
 
+pytestmark = pytest.mark.skipif(
+    "osqp" not in su.solvers,
+    reason="OSQP not functional here"
+)
+
 medium = load_qiime_medium(md.test_medium)
 db = md.test_db
 
diff --git a/tests/test_taxonomy.py b/tests/test_taxonomy.py
new file mode 100644
index 00000000..fb51a6fc
--- /dev/null
+++ b/tests/test_taxonomy.py
@@ -0,0 +1,35 @@
+"""Test helper for taxonomy handling."""
+
+from micom.data import test_taxonomy
+import micom.taxonomy as mt
+
+
+no_prefix = test_taxonomy()
+with_prefix = no_prefix.copy()
+with_prefix["genus"] = "g__" + with_prefix["genus"]
+with_prefix["species"] = "s__" + with_prefix["species"]
+
+
+def test_get_prefixes():
+    assert mt.rank_prefixes(no_prefix).isna().all()
+    assert mt.rank_prefixes(with_prefix)["species"] == "s__"
+
+
+def test_unify_all_good():
+    tax = mt.unify_rank_prefixes(no_prefix, no_prefix)
+    assert all(tax.species == no_prefix.species)
+
+    tax = mt.unify_rank_prefixes(with_prefix, with_prefix)
+    assert all(tax.species == with_prefix.species)
+
+
+def test_remove_prefix():
+    tax = mt.unify_rank_prefixes(with_prefix, no_prefix)
+    assert all(tax.genus == no_prefix.genus)
+    assert all(tax.species == no_prefix.species)
+
+
+def test_add_prefix():
+    tax = mt.unify_rank_prefixes(no_prefix, with_prefix)
+    assert all(tax.genus == with_prefix.genus)
+    assert all(tax.species == with_prefix.species)
\ No newline at end of file