Eval metrics and circular import bug fix. #380

Merged: 33 commits, Sep 25, 2024
Changes from 10 commits
Commits (33 total):
617fcb8
eval metrics bug fix
Lilferrit Sep 12, 2024
a52ba83
better eval metrics bug fix
Lilferrit Sep 12, 2024
81f4515
eval metrics bug fix
Lilferrit Sep 12, 2024
e30b674
better eval metrics bug fix
Lilferrit Sep 12, 2024
7b6bab3
eval stats unit test, circular import fix
Lilferrit Sep 16, 2024
ddbc93a
log metrics unit test
Lilferrit Sep 17, 2024
00fd170
resolved upstream merge conflict
Lilferrit Sep 17, 2024
9d4109e
removed unused import
Lilferrit Sep 17, 2024
86747d9
log metrics refactor, additional log metrics test case
Lilferrit Sep 19, 2024
c863b4a
aa_match_batch handles none, additional skipped spectra test cases
Lilferrit Sep 20, 2024
34c456d
Log optimizer and training metrics to CSV file (#376)
Lilferrit Sep 20, 2024
8f21edb
aa_match_batch and aa_match handle None
Lilferrit Sep 23, 2024
217eeb8
top_match eval metrics warning
Lilferrit Sep 23, 2024
3b27582
removed unused import
Lilferrit Sep 17, 2024
4e89028
log metrics refactor, additional log metrics test case
Lilferrit Sep 19, 2024
64a681f
aa_match_batch handles none, additional skipped spectra test cases
Lilferrit Sep 20, 2024
a3d5763
aa_match_batch and aa_match handle None
Lilferrit Sep 23, 2024
8be20ab
top_match eval metrics warning
Lilferrit Sep 23, 2024
60d4159
Merge branch 'eval-metrics-fix' of github.com:Noble-Lab/casanovo into…
Lilferrit Sep 23, 2024
5f38ea8
eval metrics bug fix
Lilferrit Sep 12, 2024
8b6e925
better eval metrics bug fix
Lilferrit Sep 12, 2024
bacf243
eval stats unit test, circular import fix
Lilferrit Sep 16, 2024
5bbbe6f
log metrics unit test
Lilferrit Sep 17, 2024
4788fab
removed unused import
Lilferrit Sep 17, 2024
c473f20
log metrics refactor, additional log metrics test case
Lilferrit Sep 19, 2024
63ac6ad
aa_match_batch handles none, additional skipped spectra test cases
Lilferrit Sep 20, 2024
7b4b6e6
aa_match_batch and aa_match handle None
Lilferrit Sep 23, 2024
78bb897
top_match eval metrics warning
Lilferrit Sep 23, 2024
fb975b2
removed unused import
Lilferrit Sep 17, 2024
692cd7e
log metrics refactor, additional log metrics test case
Lilferrit Sep 19, 2024
7740a77
metrics file logging bug fix
Lilferrit Sep 23, 2024
e9bb5ec
merge conflicts
Lilferrit Sep 23, 2024
60524af
aa_match test cases, minor aa_match refactor
Lilferrit Sep 24, 2024
41 changes: 2 additions & 39 deletions casanovo/data/ms_io.py
@@ -2,54 +2,17 @@

import collections
import csv
import dataclasses
import operator
import os
import re
from pathlib import Path
from typing import List, Tuple, Iterable
from typing import List

import natsort

from .. import __version__
from ..config import Config


@dataclasses.dataclass
class PepSpecMatch:
"""
Peptide Spectrum Match (PSM) dataclass

Parameters
----------
sequence : str
The amino acid sequence of the peptide.
spectrum_id : Tuple[str, str]
A tuple containing the spectrum identifier in the form
(spectrum file name, spectrum file idx)
peptide_score : float
Score of the match between the full peptide sequence and the
spectrum.
charge : int
The precursor charge state of the peptide ion observed in the spectrum.
calc_mz : float
The calculated mass-to-charge ratio (m/z) of the peptide based on its
sequence and charge state.
exp_mz : float
The observed (experimental) precursor mass-to-charge ratio (m/z) of the
peptide as detected in the spectrum.
aa_scores : Iterable[float]
A list of scores for individual amino acids in the peptide
sequence, where len(aa_scores) == len(sequence)
"""

sequence: str
spectrum_id: Tuple[str, str]
peptide_score: float
charge: int
calc_mz: float
exp_mz: float
aa_scores: Iterable[float]
from .psm import PepSpecMatch


class MztabWriter:
41 changes: 41 additions & 0 deletions casanovo/data/psm.py
@@ -0,0 +1,41 @@
"""Peptide spectrum match dataclass"""

import dataclasses
from typing import Tuple, Iterable


@dataclasses.dataclass
class PepSpecMatch:
"""
Peptide Spectrum Match (PSM) dataclass

Parameters
----------
sequence : str
The amino acid sequence of the peptide.
spectrum_id : Tuple[str, str]
A tuple containing the spectrum identifier in the form
(spectrum file name, spectrum file idx)
peptide_score : float
Score of the match between the full peptide sequence and the
spectrum.
charge : int
The precursor charge state of the peptide ion observed in the spectrum.
calc_mz : float
The calculated mass-to-charge ratio (m/z) of the peptide based on its
sequence and charge state.
exp_mz : float
The observed (experimental) precursor mass-to-charge ratio (m/z) of the
peptide as detected in the spectrum.
aa_scores : Iterable[float]
A list of scores for individual amino acids in the peptide
sequence, where len(aa_scores) == len(sequence)
"""

sequence: str
spectrum_id: Tuple[str, str]
peptide_score: float
charge: int
calc_mz: float
exp_mz: float
aa_scores: Iterable[float]
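
The circular-import half of the fix works by moving `PepSpecMatch` into a small leaf module, `casanovo/data/psm.py`, that imports nothing else from the package. The exact import cycle is not spelled out in the diff; the sketch below illustrates the pattern, with the offending chain inferred from the files touched in this PR rather than quoted from it:

```python
# Inferred shape of the problem and the fix (illustrative, not literal source).
#
# Before: utils.py needed only the PepSpecMatch dataclass, but importing it
# dragged in ms_io and, through it, the config machinery:
#
#   casanovo/utils.py       -> casanovo/data/ms_io.py   (for PepSpecMatch)
#   casanovo/data/ms_io.py  -> casanovo/config.py       (for Config)
#   ...                     -> casanovo/utils.py        (closing the cycle)
#
# After: the dataclass lives in a dependency-free module, so importing it
# cannot re-enter the package:
#
#   casanovo/utils.py       -> casanovo/data/psm.py     (dataclass only)
#   casanovo/data/ms_io.py  -> casanovo/data/psm.py and casanovo/config.py
```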
6 changes: 6 additions & 0 deletions casanovo/denovo/evaluate.py
@@ -225,8 +225,14 @@
# Split peptides into individual AAs if necessary.
if isinstance(peptide1, str):
peptide1 = re.split(r"(?<=.)(?=[A-Z])", peptide1)
elif peptide1 is None:
peptide1 = []

if isinstance(peptide2, str):
peptide2 = re.split(r"(?<=.)(?=[A-Z])", peptide2)
elif peptide2 is None:
peptide2 = []

n_aa1, n_aa2 = n_aa1 + len(peptide1), n_aa2 + len(peptide2)
aa_matches_batch.append(
aa_match(
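
With this change, `aa_match_batch` accepts `None` in place of a peptide and treats it as an empty sequence, so spectra that produced no prediction can still flow through evaluation. A minimal usage sketch, assuming the call signature and return-value ordering used at the `model_runner.py` call site below and the `depthcharge` residue-mass dictionary that call already relies on:

```python
# Minimal sketch of evaluating a batch that contains a skipped spectrum.
# Signatures are assumed from the model_runner.py call site in this PR.
from depthcharge.masses import PeptideMass

from casanovo.denovo.evaluate import aa_match_batch, aa_match_metrics

seq_true = ["PEP", "PET", "PEI"]   # annotations from the test index
seq_pred = ["PEP", None, "PEI"]    # None: no prediction for the second spectrum

aa_precision, _, pep_precision = aa_match_metrics(
    *aa_match_batch(seq_true, seq_pred, PeptideMass().masses)
)
# The None entry contributes zero predicted residues: it lowers peptide
# precision but no longer raises an error during batch evaluation.
```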
4 changes: 2 additions & 2 deletions casanovo/denovo/model.py
@@ -16,7 +16,7 @@

from . import evaluate
from .. import config
from ..data import ms_io
from ..data import ms_io, psm

logger = logging.getLogger("casanovo")

@@ -914,7 +914,7 @@ def on_predict_batch_end(
if len(peptide) == 0:
continue
self.out_writer.psms.append(
ms_io.PepSpecMatch(
psm.PepSpecMatch(
sequence=peptide,
spectrum_id=tuple(spectrum_i),
peptide_score=peptide_score,
24 changes: 17 additions & 7 deletions casanovo/denovo/model_runner.py
@@ -163,18 +163,28 @@
Index containing the annotated spectra used to generate model
predictions
"""
model_output = [psm.sequence for psm in self.writer.psms]
spectrum_annotations = [
test_index[i][4] for i in range(test_index.n_spectra)
]
seq_pred = []
seq_true = []
pred_idx = 0

with test_index as t_ind:
for true_idx in range(t_ind.n_spectra):
seq_true.append(t_ind[true_idx][4])
if pred_idx < len(self.writer.psms) and self.writer.psms[
pred_idx
].spectrum_id == t_ind.get_spectrum_id(true_idx):
seq_pred.append(self.writer.psms[pred_idx].sequence)
pred_idx += 1
else:
seq_pred.append(None)

aa_precision, _, pep_precision = aa_match_metrics(
*aa_match_batch(
spectrum_annotations,
model_output,
seq_true,
seq_pred,
depthcharge.masses.PeptideMass().masses,
)
)

logger.info("Peptide Precision: %.2f%%", 100 * pep_precision)
logger.info("Amino Acid Precision: %.2f%%", 100 * aa_precision)

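The reworked `log_metrics` above walks the annotated index and the predicted PSMs in lockstep: because PSMs are written in spectrum order, a single forward-only pointer pairs each annotated spectrum with its prediction, and any spectrum without a PSM gets `None` (which the `evaluate.py` change makes safe to score). A standalone sketch of just that alignment step, using plain tuples in place of the real index object:

```python
# Standalone sketch of the alignment logic in log_metrics. Assumes predictions
# are stored in the same spectrum order as the index, as in the PR.
from typing import List, Optional, Tuple

def align_predictions(
    true_ids: List[Tuple[str, str]],           # spectrum ids from the annotated index
    pred: List[Tuple[Tuple[str, str], str]],   # (spectrum_id, sequence) per PSM
) -> List[Optional[str]]:
    seq_pred: List[Optional[str]] = []
    pred_idx = 0
    for spec_id in true_ids:
        if pred_idx < len(pred) and pred[pred_idx][0] == spec_id:
            seq_pred.append(pred[pred_idx][1])
            pred_idx += 1
        else:
            # No prediction for this spectrum: it was skipped during inference.
            seq_pred.append(None)
    return seq_pred

# Example: the spectrum with index=2 was skipped, so its slot becomes None.
print(align_predictions(
    [("foo", "index=1"), ("foo", "index=2"), ("foo", "index=3")],
    [(("foo", "index=1"), "PEP"), (("foo", "index=3"), "PEI")],
))  # -> ['PEP', None, 'PEI']
```
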
@@ -272,7 +282,7 @@
tb_summarywriter = None
if self.config.tb_summarywriter:
if self.output_dir is None:
logger.warning(
"Can not create tensorboard because the output directory "
"is not set in the model runner."
)
2 changes: 1 addition & 1 deletion casanovo/utils.py
@@ -15,7 +15,7 @@
import psutil
import torch

from .data.ms_io import PepSpecMatch
from .data.psm import PepSpecMatch


SCORE_BINS = [0.0, 0.5, 0.9, 0.95, 0.99]
170 changes: 170 additions & 0 deletions tests/unit_tests/test_runner.py
@@ -1,11 +1,13 @@
"""Unit tests specifically for the model_runner module."""

import unittest.mock
from pathlib import Path

import pytest
import torch

from casanovo.config import Config
from casanovo.data.psm import PepSpecMatch
from casanovo.denovo.model_runner import ModelRunner


@@ -282,3 +284,171 @@ def test_evaluate(
)

result_file.unlink()


def test_log_metrics(monkeypatch, tiny_config):
def get_mock_index(psm_list):
mock_test_index = unittest.mock.MagicMock()
mock_test_index.__enter__.return_value = mock_test_index
mock_test_index.__exit__.return_value = False
mock_test_index.n_spectra = len(psm_list)
mock_test_index.get_spectrum_id = lambda idx: psm_list[idx].spectrum_id

mock_spectra = [
(None, None, None, None, curr_psm.sequence)
for curr_psm in psm_list
]
mock_test_index.__getitem__.side_effect = lambda idx: mock_spectra[idx]
return mock_test_index

def get_mock_psm(sequence, spectrum_id):
return PepSpecMatch(
sequence=sequence,
spectrum_id=spectrum_id,
peptide_score=None,
charge=None,
exp_mz=None,
aa_scores=None,
calc_mz=None,
)

with monkeypatch.context() as ctx:
mock_logger = unittest.mock.MagicMock()
ctx.setattr("casanovo.denovo.model_runner.logger", mock_logger)

with ModelRunner(Config(tiny_config)) as runner:
runner.writer = unittest.mock.MagicMock()

# Test 100% peptide precision
infer_psms = [
get_mock_psm("PEP", ("foo", "index=1")),
get_mock_psm("PET", ("foo", "index=2")),
]

act_psms = [
get_mock_psm("PEP", ("foo", "index=1")),
get_mock_psm("PET", ("foo", "index=2")),
]

runner.writer.psms = infer_psms
mock_index = get_mock_index(act_psms)
runner.log_metrics(mock_index)

pep_precision = mock_logger.info.call_args_list[-2][0][1]
aa_precision = mock_logger.info.call_args_list[-1][0][1]
assert pep_precision == pytest.approx(100)
assert aa_precision == pytest.approx(100)

# Test 50% peptide precision (one wrong)
infer_psms = [
get_mock_psm("PEP", ("foo", "index=1")),
get_mock_psm("PET", ("foo", "index=2")),
]

act_psms = [
get_mock_psm("PEP", ("foo", "index=1")),
get_mock_psm("PEP", ("foo", "index=2")),
]

runner.writer.psms = infer_psms
mock_index = get_mock_index(act_psms)
runner.log_metrics(mock_index)

pep_precision = mock_logger.info.call_args_list[-2][0][1]
aa_precision = mock_logger.info.call_args_list[-1][0][1]
assert pep_precision == pytest.approx(100 * (1 / 2))
assert aa_precision == pytest.approx(100 * (5 / 6))

# Test skipped spectra
act_psms = [
get_mock_psm("PEP", ("foo", "index=1")),
get_mock_psm("PET", ("foo", "index=2")),
get_mock_psm("PEI", ("foo", "index=3")),
get_mock_psm("PEG", ("foo", "index=4")),
get_mock_psm("PEA", ("foo", "index=5")),
]

infer_psms = [
get_mock_psm("PEP", ("foo", "index=1")),
get_mock_psm("PET", ("foo", "index=2")),
get_mock_psm("PEI", ("foo", "index=3")),
get_mock_psm("PEA", ("foo", "index=5")),
]

runner.writer.psms = infer_psms
mock_index = get_mock_index(act_psms)
runner.log_metrics(mock_index)

pep_precision = mock_logger.info.call_args_list[-2][0][1]
aa_precision = mock_logger.info.call_args_list[-1][0][1]
assert pep_precision == pytest.approx(100 * (4 / 5))
assert aa_precision == pytest.approx(100)

infer_psms = [
get_mock_psm("PEP", ("foo", "index=1")),
get_mock_psm("PET", ("foo", "index=2")),
get_mock_psm("PEI", ("foo", "index=3")),
get_mock_psm("PEG", ("foo", "index=4")),
]

runner.writer.psms = infer_psms
mock_index = get_mock_index(act_psms)
runner.log_metrics(mock_index)

pep_precision = mock_logger.info.call_args_list[-2][0][1]
aa_precision = mock_logger.info.call_args_list[-1][0][1]
assert pep_precision == pytest.approx(100 * (4 / 5))
assert aa_precision == pytest.approx(100)

infer_psms = [
get_mock_psm("PEP", ("foo", "index=1")),
get_mock_psm("PEI", ("foo", "index=3")),
]

runner.writer.psms = infer_psms
mock_index = get_mock_index(act_psms)
runner.log_metrics(mock_index)

pep_precision = mock_logger.info.call_args_list[-2][0][1]
aa_precision = mock_logger.info.call_args_list[-1][0][1]
assert pep_precision == pytest.approx(100 * (2 / 5))
assert aa_precision == pytest.approx(100)

infer_psms = [
get_mock_psm("PEP", ("foo", "index=1")),
get_mock_psm("PEA", ("foo", "index=5")),
]

runner.writer.psms = infer_psms
mock_index = get_mock_index(act_psms)
runner.log_metrics(mock_index)

pep_precision = mock_logger.info.call_args_list[-2][0][1]
aa_precision = mock_logger.info.call_args_list[-1][0][1]
assert pep_precision == pytest.approx(100 * (2 / 5))
assert aa_precision == pytest.approx(100)

# Test un-inferred spectra
act_psms = [
get_mock_psm("PEP", ("foo", "index=1")),
get_mock_psm("PET", ("foo", "index=2")),
get_mock_psm("PEI", ("foo", "index=3")),
get_mock_psm("PEG", ("foo", "index=4")),
]

infer_psms = [
get_mock_psm("PE", ("foo", "index=1")),
get_mock_psm("PE", ("foo", "index=2")),
get_mock_psm("PE", ("foo", "index=3")),
get_mock_psm("PE", ("foo", "index=4")),
get_mock_psm("PE", ("foo", "index=5")),
]

runner.writer.psms = infer_psms
mock_index = get_mock_index(act_psms)
runner.log_metrics(mock_index)

pep_precision = mock_logger.info.call_args_list[-2][0][1]
aa_precision = mock_logger.info.call_args_list[-1][0][1]
assert pep_precision == pytest.approx(0)
assert aa_precision == pytest.approx(100)
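
As a sanity check on the expected values in the 50% peptide-precision case above: `PEP` vs. `PEP` matches 3 of 3 residues and `PET` vs. `PEP` matches 2 of 3, so 5 of 6 predicted residues are correct while only 1 of 2 peptides is fully correct. A throwaway snippet (not part of the PR; it uses positional identity rather than the mass-tolerant comparison the real `aa_match` performs) reproducing that arithmetic:

```python
# Quick arithmetic check for the 50% peptide precision test case above
# (hypothetical helper, not part of the casanovo test suite).
pairs = [("PEP", "PEP"), ("PEP", "PET")]  # (annotated, predicted)

aa_matched = sum(
    sum(a == b for a, b in zip(true_seq, pred_seq))
    for true_seq, pred_seq in pairs
)
aa_total = sum(len(pred_seq) for _, pred_seq in pairs)
pep_correct = sum(true_seq == pred_seq for true_seq, pred_seq in pairs)

assert aa_matched / aa_total == 5 / 6     # matches pytest.approx(100 * (5 / 6))
assert pep_correct / len(pairs) == 1 / 2  # matches pytest.approx(100 * (1 / 2))
```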