PSM Data Class (#368)

* PSM data class
* PepSpecMatch field naming and documentation
Noble-Lab · Aug 23, 2024 · aee3534 · aee3534
1 parent 67939b8
commit aee3534
Show file tree

Hide file tree

Showing 4 changed files with 68 additions and 25 deletions.
diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py
@@ -2,18 +2,56 @@
 
 import collections
 import csv
+import dataclasses
 import operator
 import os
 import re
 from pathlib import Path
-from typing import List
+from typing import List, Tuple, Iterable
 
 import natsort
 
 from .. import __version__
 from ..config import Config
 
 
+@dataclasses.dataclass
+class PepSpecMatch:
+    """
+    Record describing a single peptide-spectrum match (PSM).
+
+    Parameters
+    ----------
+    sequence : str
+        The amino acid sequence of the peptide.
+    spectrum_id : Tuple[str, str]
+        A tuple containing the spectrum identifier in the form
+        (spectrum file name, spectrum file idx).
+    peptide_score : float
+        Score of the match between the full peptide sequence and the
+        spectrum.
+    charge : int
+        The precursor charge state of the peptide ion observed in the spectrum.
+    calc_mz : float
+        The calculated (theoretical) mass-to-charge ratio (m/z) of the
+        peptide, based on its sequence and charge state.
+    exp_mz : float
+        The observed (experimental) precursor mass-to-charge ratio (m/z) of the
+        peptide as detected in the spectrum.
+    aa_scores : Iterable[float]
+        Scores for the individual amino acids in the peptide sequence,
+        where len(aa_scores) == len(sequence).
+    """
+
+    sequence: str
+    spectrum_id: Tuple[str, str]
+    peptide_score: float
+    charge: int
+    calc_mz: float
+    exp_mz: float
+    aa_scores: Iterable[float]
+
+
 class MztabWriter:
     """
     Export spectrum identifications to an mzTab file.
@@ -42,7 +80,7 @@ def __init__(self, filename: str):
             ),
         ]
         self._run_map = {}
-        self.psms = []
+        self.psms: List[PepSpecMatch] = []
 
     def set_metadata(self, config: Config, **kwargs) -> None:
         """
@@ -178,34 +216,39 @@ def save(self) -> None:
                 ]
             )
             for i, psm in enumerate(
-                natsort.natsorted(self.psms, key=operator.itemgetter(1)), 1
+                natsort.natsorted(
+                    self.psms, key=operator.attrgetter("spectrum_id")
+                ),
+                1,
             ):
-                filename, idx = os.path.abspath(psm[1][0]), psm[1][1]
+                filename = os.path.abspath(psm.spectrum_id[0])
+                idx = psm.spectrum_id[1]
                 writer.writerow(
                     [
                         "PSM",
-                        psm[0],  # sequence
+                        psm.sequence,  # sequence
                         i,  # PSM_ID
                         "null",  # accession
                         "null",  # unique
                         "null",  # database
                         "null",  # database_version
                         f"[MS, MS:1003281, Casanovo, {__version__}]",
-                        psm[2],  # search_engine_score[1]
+                        psm.peptide_score,  # search_engine_score[1]
                         # FIXME: Modifications should be specified as
                         #  controlled vocabulary terms.
                         "null",  # modifications
                         # FIXME: Can we get the retention time from the data
                         #  loader?
                         "null",  # retention_time
-                        int(psm[3]),  # charge
-                        psm[4],  # exp_mass_to_charge
-                        psm[5],  # calc_mass_to_charge
+                        psm.charge,  # charge
+                        psm.exp_mz,  # exp_mass_to_charge
+                        psm.calc_mz,  # calc_mass_to_charge
                         f"ms_run[{self._run_map[filename]}]:{idx}",
                         "null",  # pre
                         "null",  # post
                         "null",  # start
                         "null",  # end
-                        psm[6],  # opt_ms_run[1]_aa_scores
+                        # opt_ms_run[1]_aa_scores
+                        ",".join(list(map("{:.5f}".format, psm.aa_scores))),
                     ]
                 )
diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py
@@ -914,15 +914,17 @@ def on_predict_batch_end(
             if len(peptide) == 0:
                 continue
             self.out_writer.psms.append(
-                (
-                    peptide,
-                    tuple(spectrum_i),
-                    peptide_score,
-                    charge,
-                    precursor_mz,
-                    self.peptide_mass_calculator.mass(peptide, charge),
-                    ",".join(list(map("{:.5f}".format, aa_scores))),
-                ),
+                ms_io.PepSpecMatch(
+                    sequence=peptide,
+                    spectrum_id=tuple(spectrum_i),
+                    peptide_score=peptide_score,
+                    charge=int(charge),
+                    # calc_mz is the m/z computed from the predicted peptide;
+                    # exp_mz is the precursor m/z observed for the spectrum.
+                    calc_mz=self.peptide_mass_calculator.mass(peptide, charge),
+                    exp_mz=precursor_mz,
+                    aa_scores=aa_scores,
+                )
             )
 
     def _log_history(self) -> None:

diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py
@@ -132,7 +132,7 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None:
             Index containing the annotated spectra used to generate model
             predictions
         """
-        model_output = [psm[0] for psm in self.writer.psms]
+        model_output = [psm.sequence for psm in self.writer.psms]
         spectrum_annotations = [
             test_index[i][4] for i in range(test_index.n_spectra)
         ]

diff --git a/casanovo/utils.py b/casanovo/utils.py
@@ -1,13 +1,11 @@
 """Small utility functions"""
 
-import heapq
 import logging
 import os
 import platform
 import re
 import socket
 import sys
-import time
 from datetime import datetime
 from typing import Tuple, Dict, List, Optional
 
@@ -16,6 +14,8 @@
 import psutil
 import torch
 
+from .data.ms_io import PepSpecMatch
+
 
 SCORE_BINS = [0.0, 0.5, 0.9, 0.95, 0.99]
 
@@ -195,7 +195,7 @@ def log_run_report(
 
 
 def log_sequencing_report(
-    predictions: Tuple[str, Tuple[str, str], float, float, float, float, str],
+    predictions: List[PepSpecMatch],
     start_time: Optional[int] = None,
     end_time: Optional[int] = None,
     score_bins: List[float] = SCORE_BINS,
@@ -219,8 +219,8 @@ def log_sequencing_report(
     run_report = get_report_dict(
         pd.DataFrame(
             {
-                "sequence": [psm[0] for psm in predictions],
-                "score": [psm[2] for psm in predictions],
+                "sequence": [psm.sequence for psm in predictions],
+                "score": [psm.peptide_score for psm in predictions],
             }
         ),
         score_bins=score_bins,