
Commit

add MS2 annotation and logo
axelwalter committed Apr 24, 2024
1 parent c06608c commit bf45b36
Showing 11 changed files with 171 additions and 71 deletions.
2 changes: 1 addition & 1 deletion Home.py
@@ -4,9 +4,9 @@

params = page_setup(page="main")

st.image("assets/umetaflow-logo.png", width=300)
st.markdown(
"""
# UmetaFlow
## A universal metabolomics tool
This app is based on the [UmetaFlow](https://chemrxiv.org/engage/chemrxiv/article-details/634fb68fdfbd2b6abc5c5fcd) workflow for LC-MS data analysis. UmetaFlow is implemented as a [snakemake pipeline](https://github.com/NBChub/snakemake-UmetaFlow) and as a Python version in [Jupyter notebooks](https://github.com/eeko-kon/pyOpenMS_UmetaFlow) based on [pyOpenMS](https://pyopenms.readthedocs.io/en/latest/index.html).
16 changes: 13 additions & 3 deletions README.md
@@ -1,6 +1,16 @@
# UmetaFlow GUI [Open!](https://abi-services.cs.uni-tuebingen.de/umetaflow/)

<img src="assets/pyopenms_transparent_background.png" width=20%>
<table>
<tr>
<td>
<img src="assets/umetaflow-logo.png" alt="UmetaFlow logo">
</td>
<td>
<img src="assets/pyopenms_transparent_background.png" alt="pyOpenMS logo">
</td>
<td>
<a href="https://abi-services.cs.uni-tuebingen.de/umetaflow/">Open app</a>
</td>
</tr>
</table>

This app is based on the [UmetaFlow](https://chemrxiv.org/engage/chemrxiv/article-details/634fb68fdfbd2b6abc5c5fcd) workflow for LC-MS data analysis. UmetaFlow is implemented as a [snakemake pipeline](https://github.com/NBChub/snakemake-UmetaFlow) and as a Python version in [Jupyter notebooks](https://github.com/eeko-kon/pyOpenMS_UmetaFlow) based on [pyOpenMS](https://pyopenms.readthedocs.io/en/latest/index.html).

Binary file added assets/umetaflow-logo.png
1 change: 1 addition & 0 deletions pages/4_🧪_UmetaFlow.py
@@ -11,6 +11,7 @@
c1.title("UmetaFlow")
v_space(1, c2)
results_only = c2.toggle("view results only")
st.markdown("💡**Minimal interface with pyOpenMS only. For more advanced use cases and fast execution times use UmetaFlow TOPP. Requires OpenMS TOPP tools to be installed.**")

results_dir = Path(st.session_state.workspace, "umetaflow-results")

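The hint above notes that UmetaFlow TOPP requires the OpenMS TOPP tools to be installed. For reference, a minimal sketch (outside this diff) that checks whether the TOPP tools invoked by this workflow are on PATH; the tool list is taken from the `input_TOPP`/`run_topp` calls in `src/UmetaFlowTOPPWorkflow.py`, everything else is standard library.

```python
import shutil

# TOPP tools invoked by the UmetaFlow TOPP workflow in this commit.
REQUIRED_TOPP_TOOLS = [
    "FeatureFinderMetaboIdent",
    "MetaboliteSpectralMatcher",
    "SiriusExport",
    "GNPSExport",
    "FileFilter",
]

missing = [tool for tool in REQUIRED_TOPP_TOOLS if shutil.which(tool) is None]
if missing:
    print("Missing OpenMS TOPP tools:", ", ".join(missing))
else:
    print("All required TOPP tools found on PATH.")
```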
4 changes: 3 additions & 1 deletion pages/6_🚧_Testing.py
@@ -5,5 +5,7 @@
import pyopenms as poms

from pathlib import Path
import zipfile
from pyteomics.mztab import MzTab

params = {"in": []}

82 changes: 41 additions & 41 deletions src/UmetaFlowTOPPWorkflow.py
@@ -27,8 +27,9 @@ def configure(self) -> None:
[
"**Pre-Processing**",
"Re-Quantification",
"Input files for SIRIUS & GNPS",
"Annotation by in-house library",
"SIRIUS",
"GNPS FBMN"
]
)
with tabs[0]:
@@ -100,50 +101,38 @@ def configure(self) -> None:
)
self.ui.input_TOPP("FeatureFinderMetaboIdent")
with tabs[2]:
st.markdown("**Export input files**")
t = st.tabs(["MS1", "MS2"])
with t[0]:
self.ui.input_widget("annotate-ms1", False, "annotate consensus features", help="Based on m/z and RT")
self.ui.simple_file_uploader("ms1-library", "tsv", "MS1 library in tsv format")
self.ui.input_python("annotate-ms1", num_cols=2)
with t[1]:
self.ui.input_widget("annotate-ms2", False, "annotate consensus features", help="Based on MS2 spectrum similarity.")
self.ui.simple_file_uploader("ms2-library", "mgf", "MS2 library in mgf format")
self.ui.input_TOPP("MetaboliteSpectralMatcher")
with tabs[3]:
self.ui.input_widget(
"export-sirius",
False,
"export files for SIRIUS",
help="Generate input files for SIRIUS from raw data and feature information using the OpenMS TOPP tool *SiriusExport*.",
)
self.ui.input_TOPP("SiriusExport")
# st.markdown("**Run SIRIUS and annotate features**")
# self.ui.input_widget("run-sirius", False, "run SIRIUS and annotate features")
# self.ui.input_widget
# t = st.tabs(["Formula prediction: SIRIUS", "Structure prediction: CSI : FingerID", "CANOPUS"])
with tabs[4]:
self.ui.input_widget(
"export-gnps",
False,
"export files for GNPS FBMN and IIMN",
help="Generate input files for GNPS feature based molecular networking (FBMN) and ion identity molecular networking (IIMN) from raw data and feature information using the OpenMS TOPP tool *GNPSExport*.",
)
t = st.tabs(["**SIRIUS**", "**GNPS**"])
with t[0]:
self.ui.input_TOPP("SiriusExport")
with t[1]:
self.ui.input_TOPP("GNPSExport")
st.markdown("**Run SIRIUS and annotate features**")
self.ui.input_widget("run-sirius", False, "run SIRIUS and annotate features")
self.ui.input_widget
t = st.tabs(["Formula prediction: SIRIUS", "Structure prediction: CSI : FingerID", "CANOPUS"])
with tabs[3]:
t = st.tabs(["MS1", "MS2"])
with t[0]:
self.ui.input_widget("annotate-ms1", False, "annotate consensus features", help="Based on m/z and RT")
self.ui.simple_file_uploader("ms1-library", "tsv", "MS1 library in tsv format")
self.ui.input_python("annotate-ms1", num_cols=2)
with t[1]:
self.ui.input_widget("annotate-ms2", False, "annotate consensus features", help="Based on MS2 spectrum similarity.")
self.ui.simple_file_uploader("ms2-library", "mgf", "MS2 library in mgf format")

self.ui.input_TOPP("GNPSExport")

def execution(self) -> None:
# Set log levels from st.session_state
if st.session_state["log_level"] in [
"commands and execution times",
"show all",
]:
self.executor.log_commands = True
if st.session_state["log_level"] in ["tool outputs", "show all"]:
self.executor.log_tool_outputs = True


# # Get mzML files
# Get mzML files
df_path = Path(st.session_state.workspace, "mzML-files.tsv")

if not df_path.exists():
@@ -157,7 +146,7 @@ def execution(self) -> None:
# Construct full file paths
mzML = [str(Path(st.session_state.workspace, "mzML-files", file_name)) for file_name in selected_files]

if len(mzML) <= 1:
if len(mzML) == 0:
self.logger.log("ERROR: Select at leat two mzML files to run this workflow.")
return

@@ -353,7 +342,7 @@ def execution(self) -> None:
"out": self.file_manager.get_files(mzML, "ms", "sirius-export"),
},
)
if self.params["export-gnps"]:
if self.params["export-gnps"] or self.params["annotate-ms2"]:
self.logger.log("Exporting input files for GNPS.")
# Map MS2 specs to features
self.executor.run_topp(
@@ -383,6 +372,8 @@ def execution(self) -> None:
"export_consensus_df",
{"in": gnps_consensus, "out": consensus_df},
)

if self.params["export-gnps"]:
# Filter consensus features which have missing values
self.executor.run_topp(
"FileFilter",
@@ -417,10 +408,19 @@ def execution(self) -> None:
if files:
self.logger.log("Annotating consensus features on MS1 level.")
self.executor.run_python("annotate-ms1", {"in": consensus_df, "in_lib": str(files[0])})


if self.params["annotate-ms2"]:
dir_path = Path(self.workflow_dir, "input-files", "ms2-library")
if dir_path.exists():
files = [p for p in dir_path.iterdir()]
if files:
self.logger.log("Annotating consensus features on MS2 level.")
ms2_matches = self.file_manager.get_files(mzML, "mzTab", "ms2-matches")
self.executor.run_topp("MetaboliteSpectralMatcher", {"in": mzML, "database": self.file_manager.get_files(str(files[0])), "out": ms2_matches})
self.executor.run_python("annotate-ms2", {"in": consensus_df})

# ZIP all relevant files for Download
self.executor.run_python("zip-result-files", {"in": consensus_df})


def results(self) -> None:
def load_parquet(file):
@@ -486,6 +486,7 @@ def quality_colors(value):
"intensity",
df_matrix.apply(lambda row: [row[col] for col in sample_cols], axis=1),
)
df_matrix.set_index("metabolite", inplace=True)
st.dataframe(
df_matrix,
column_order=[
@@ -495,7 +496,7 @@ def quality_colors(value):
"charge",
"adduct",
"MS1 annotation",
"metabolite",
"MS2 annotation",
],
hide_index=False,
column_config={
@@ -517,12 +518,11 @@ def quality_colors(value):

with tabs[1]:
c1, c2 = st.columns(2)
metabolite = c1.selectbox("Select metabolite", df_matrix["metabolite"])
metabolite = c1.selectbox("Select metabolite", df_matrix.index)

@st.cache_data
def get_chroms_for_each_sample(metabolite):
# Get index of row in df_matrix where "metabolite" is equal to metabolite
index = df_matrix[df_matrix["metabolite"] == metabolite].index[0]
all_samples = [
col.replace(".mzML_IDs", "")
for col in df_matrix.columns
@@ -532,7 +532,7 @@ def get_chroms_for_each_sample(metabolite):
samples = []
for sample in all_samples:
# Get feature ID for sample
fid = df_matrix.loc[index, sample + ".mzML_IDs"]
fid = df_matrix.loc[metabolite, sample + ".mzML_IDs"]
path = Path(feature_df_dir, sample + ".parquet")
f_df = load_parquet(path)
if fid in f_df.index:
@@ -639,7 +639,7 @@ def get_feature_intensity_plot(df):
),

with tabs[3]:
if st.button("Prepare result files for download", type="primary"):
if st.button("Prepare result files for download"):
with open(Path(self.workflow_dir, "results", "results.zip"), "rb") as fp:
st.download_button(
label="Download Results",
6 changes: 4 additions & 2 deletions src/common.py
@@ -93,7 +93,7 @@ def page_setup(page: str = "", help_text: str = "") -> dict[str, Any]:
# Set Streamlit page configurations
st.set_page_config(
page_title=APP_NAME,
page_icon="assets/icon.png",
page_icon="assets/umetaflow-logo.png",
layout="wide",
initial_sidebar_state="auto",
menu_items=None
@@ -243,7 +243,9 @@ def change_workspace():
)
if help_text:
st.info(help_text)
st.image("assets/pyopenms_transparent_background.png", "powered by")
c1, c2 = st.columns(2)
c1.image("assets/pyopenms_transparent_background.png")
c2.image("assets/umetaflow-logo.png")
return params

def v_space(n: int, col=None) -> None:
68 changes: 68 additions & 0 deletions src/python-tools/annotate-ms2.py
@@ -0,0 +1,68 @@
import json
import sys
from pathlib import Path
import pandas as pd
from pyteomics.mztab import MzTab

############################
# default parameter values #
############################
#
# Mandatory keys for each parameter
# key: a unique identifier
# value: the default value
#
# Optional keys for each parameter
# name: the name of the parameter
# hide: don't show the parameter in the parameter section (e.g. for input/output files)
# options: a list of valid options for the parameter
# min: the minimum value for the parameter (int and float)
# max: the maximum value for the parameter (int and float)
# step_size: the step size for the parameter (int and float)
# help: a description of the parameter
# widget_type: the type of widget to use for the parameter (default: auto)
# advanced: whether or not the parameter is advanced (default: False)

DEFAULTS = [
{"key": "in", "value": [], "help": "feature matrix parquet file", "hide": True},
]

def get_params():
if len(sys.argv) > 1:
with open(sys.argv[1], "r") as f:
return json.load(f)
else:
return {}

if __name__ == "__main__":
params = get_params()

df = pd.read_parquet(params["in"][0])

df.insert(3, "MS2 annotation", "")

if not df["MS2_native_specs"].any():
print("No MS2 native spectra found for MS2 annotation.")

def annotate(x, mztab_file, df_mztab):
# get spec ids for the matching mztab file
spec_ids = [spec[1] for spec in [spec.split("_") for spec in x["MS2_native_specs"].split(";")] if Path(spec[0]).stem == Path(mztab_file).stem]
ann = x["MS2 annotation"]
if spec_ids:
for sid in spec_ids:
if sid in df_mztab["native id"].values:
tmp_ann = df_mztab[df_mztab["native id"] == sid]["description"].values[0]
if ann:
ann += "; "
ann += tmp_ann
return ann

mztab_dir = Path(Path(params["in"][0]).parent.parent, "ms2-matches")
for file in mztab_dir.iterdir():
spectralmatch = MzTab(str(file), encoding="UTF8", table_format="df")
spectralmatch.metadata
df_mztab = spectralmatch.small_molecule_table
df_mztab["native id"] = df_mztab["opt_spec_native_id"].str.split("=").str.get(-1)
df["MS2 annotation"] = df.apply(annotate, args=(str(file), df_mztab), axis=1)

df.to_parquet(params["in"][0])
15 changes: 3 additions & 12 deletions src/python-tools/export_consensus_df.py
@@ -89,24 +89,15 @@ def get_params():
for i, f in enumerate(fnames):
df[f"{fnames[i]}_IDs"] = ids[i]

# annotate spectrum IDs for MS2 specs associated with feature
df["MS2_native_specs"] = [";".join([f"{fnames[p.getMetaValue('map_index')]}_{p.getMetaValue('spectrum_index')+1}" for p in f.getPeptideIdentifications()]) for f in consensus_map]

# Rename columns to not show full file path
df = df.rename(columns={col: Path(col).name for col in df.columns if Path(col).exists()})

df = df.reset_index(drop=True)
df.index = df.index + 1
df.index.name = "id"

# # Get info if requantified or not from one (first) of the feature maps
# df["re-quantified"] = df[[c for c in df.columns if c.endswith("_IDs")]].apply(lambda x: x.isna().any(), axis=1)

# # Function to generate evenly spaced ranks
# def generate_ranks(group):
# group = group.sort_values("quality") # Sort by quality
# group["quality ranked"] = np.linspace(0, 1, len(group)) # Generate ranks
# return group

# # Apply the function to each group and concatenate the results
# df = df.groupby("re-quantified").apply(generate_ranks).sort_values("quality ranked", ascending=False)

path = Path(params["out"][0])
df.to_parquet(path)
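A minimal sketch of how the added `MS2_native_specs` column is encoded, using hypothetical `(map_index, spectrum_index)` pairs in place of the meta values the script reads from the ConsensusMap's PeptideIdentifications via pyOpenMS.

```python
# File names backing the consensus map, as used for the per-sample ID columns above.
fnames = ["sampleA.mzML", "sampleB.mzML"]

# Hypothetical (map_index, spectrum_index) meta values for one consensus feature;
# in the real script these come from pyOpenMS PeptideIdentification objects.
peptide_id_meta = [(0, 41), (1, 12)]

# Same encoding as the added line: "<file name>_<1-based spectrum index>", joined by ";".
ms2_native_specs = ";".join(
    f"{fnames[map_index]}_{spectrum_index + 1}"
    for map_index, spectrum_index in peptide_id_meta
)
print(ms2_native_specs)  # -> "sampleA.mzML_42;sampleB.mzML_13"
```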
3 changes: 1 addition & 2 deletions src/python-tools/export_ffmid_df.py
@@ -24,7 +24,7 @@
# advanced: whether or not the parameter is advanced (default: False)

DEFAULTS = [
{"key": "in", "value": [], "help": "ffm featureXML dir", "hide": True},
{"key": "in", "value": [], "help": "ffmid featureXML dir", "hide": True},
]

def get_params():
@@ -41,7 +41,6 @@ def get_params():
if not out_path.exists():
out_path.mkdir(exist_ok=True)
for file in Path(params["in"][0]).parent.glob("*.featureXML"):
print(file)
fm = poms.FeatureMap()
poms.FeatureXMLFile().load(str(file), fm)
# Get DataFrame with meta values
