From 10559e4652431e50d77b78704b5215659bc281fa Mon Sep 17 00:00:00 2001 From: Will Fondrie Date: Wed, 20 Jul 2022 10:23:51 -0700 Subject: [PATCH] Fixed grouped scores and added test (#65) * Fixed grouped scores and added test * Updated changelog --- CHANGELOG.md | 4 +++ mokapot/confidence.py | 11 +++++---- tests/unit_tests/test_confidence.py | 38 +++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2030c9a2..588e4084 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog for mokapot +## [0.8.3] - 2022-07-20 +### Fixed +- Fixed the reported mokapot score when group FDR is used. + ## [0.8.2] - 2022-07-18 ### Added - `mokapot.Model()` objects now recored the CV fold that they were fit on. diff --git a/mokapot/confidence.py b/mokapot/confidence.py index c8935cbd..e72cecdf 100644 --- a/mokapot/confidence.py +++ b/mokapot/confidence.py @@ -63,9 +63,10 @@ def __init__(self, psms, scores, desc=True, eval_fdr=0.01): group_psms = copy.copy(psms) self.group_column = group_psms._group_column group_psms._group_column = None - scores = scores * (desc * 2 - 1) - # Do TDC + # Do TDC to eliminate multiple PSMs for a spectrum that may occur + # in different groups. 
+ keep = "last" if desc else "first" scores = ( pd.Series(scores, index=psms._data.index) .sample(frac=1) @@ -74,7 +75,7 @@ def __init__(self, psms, scores, desc=True, eval_fdr=0.01): idx = ( psms.data.loc[scores.index, :] - .drop_duplicates(psms._spectrum_columns, keep="last") + .drop_duplicates(psms._spectrum_columns, keep=keep) .index ) @@ -84,9 +85,9 @@ def __init__(self, psms, scores, desc=True, eval_fdr=0.01): group_psms._data = None tdc_winners = group_df.index.intersection(idx) group_psms._data = group_df.loc[tdc_winners, :] - group_scores = scores.loc[group_psms._data.index].values + 1 + group_scores = scores.loc[group_psms._data.index].values res = group_psms.assign_confidence( - group_scores * (2 * desc - 1), desc=desc, eval_fdr=eval_fdr + group_scores, desc=desc, eval_fdr=eval_fdr ) self._group_confidence_estimates[group] = res diff --git a/tests/unit_tests/test_confidence.py b/tests/unit_tests/test_confidence.py index e69de29b..0be3fcdb 100644 --- a/tests/unit_tests/test_confidence.py +++ b/tests/unit_tests/test_confidence.py @@ -0,0 +1,38 @@ +"""Test that Confidence classes are working correctly""" +import pytest +import numpy as np +import pandas as pd +from mokapot import LinearPsmDataset + + +def test_one_group(psm_df_1000): + """Test that one group is equivalent to no group.""" + psm_data, _ = psm_df_1000 + psm_data["group"] = 0 + + psms = LinearPsmDataset( + psms=psm_data, + target_column="target", + spectrum_columns="spectrum", + peptide_column="peptide", + feature_columns="score", + filename_column="filename", + scan_column="spectrum", + calcmass_column="calcmass", + expmass_column="expmass", + rt_column="ret_time", + charge_column="charge", + group_column="group", + copy_data=True, + ) + + np.random.seed(42) + grouped = psms.assign_confidence() + scores1 = grouped.group_confidence_estimates[0].psms["mokapot score"] + + np.random.seed(42) + psms._group_column = None + ungrouped = psms.assign_confidence() + scores2 = ungrouped.psms["mokapot 
score"] + + pd.testing.assert_series_equal(scores1, scores2)