From 5148da1c09cfc92f9b776ae8306619429d0656cb Mon Sep 17 00:00:00 2001 From: Will Fondrie Date: Fri, 3 Sep 2021 10:14:45 -0700 Subject: [PATCH] added support for subset > data size (#40) Co-authored-by: Jspaezp --- CHANGELOG.md | 5 +++++ mokapot/config.py | 6 ++++-- mokapot/model.py | 26 ++++++++++++++++++++------ tests/system_tests/test_cli.py | 2 ++ tests/unit_tests/test_model.py | 12 ++++++++++++ 5 files changed, 43 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7d4edb65..d66f6727 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog for mokapot +## [0.7.4] - 2021-09-03 +### Changed +- Improved documentation and added warnings for `--subset_max_train`. Thanks + @jspaezp! + ## [0.7.3] - 2021-07-20 ### Fixed - Fixed bug where the `--keep_decoys` did not work with `--aggregate`. Also, diff --git a/mokapot/config.py b/mokapot/config.py index 8e6b95ee..04a7cbfa 100644 --- a/mokapot/config.py +++ b/mokapot/config.py @@ -216,8 +216,10 @@ def _parser(): type=int, default=None, help=( - "Use only a random subset of PSMs for training. " - "This is useful for very large datasets." + "Maximum number of PSMs to use during the training " + "of each of the cross validation folds in the model. " + "This is useful for very large datasets and will be " + "ignored if fewer PSMs are available." 
), ) diff --git a/mokapot/model.py b/mokapot/model.py index 1eed471b..ceea21a8 100644 --- a/mokapot/model.py +++ b/mokapot/model.py @@ -266,12 +266,26 @@ def fit(self, psms): ) if self.subset_max_train is not None: - subset_idx = np.random.choice( - len(psms), self.subset_max_train, replace=False - ) - - psms = copy.copy(psms) - psms._data = psms._data.iloc[subset_idx, :] + if self.subset_max_train > len(psms): + LOGGER.warning( + "The provided subset value (%i) is larger than the number " + "of psms in the training split (%i), so it will be " + "ignored.", + self.subset_max_train, + len(psms), + ) + else: + LOGGER.info( + "Subsetting PSMs (%i) to (%i).", + len(psms), + self.subset_max_train, + ) + subset_idx = np.random.choice( + len(psms), self.subset_max_train, replace=False + ) + + psms = copy.copy(psms) + psms._data = psms._data.iloc[subset_idx, :] # Choose the initial direction start_labels, feat_pass = _get_starting_labels(psms, self) diff --git a/tests/system_tests/test_cli.py b/tests/system_tests/test_cli.py index d33da6ed..83ee743f 100644 --- a/tests/system_tests/test_cli.py +++ b/tests/system_tests/test_cli.py @@ -67,6 +67,8 @@ def test_cli_options(tmp_path, scope_files): "--max_iter", "1", "--keep_decoys", + "--subset_max_train", + "50000", ] subprocess.run(cmd, check=True) diff --git a/tests/unit_tests/test_model.py b/tests/unit_tests/test_model.py index ec519d18..06a62903 100644 --- a/tests/unit_tests/test_model.py +++ b/tests/unit_tests/test_model.py @@ -72,6 +72,18 @@ def test_model_fit(psms): assert model.is_trained +def test_model_fit_large_subset(psms): + model = mokapot.Model( + LogisticRegression(), + train_fdr=0.05, + max_iter=1, + subset_max_train=2_000_000_000, + ) + model.fit(psms) + + assert model.is_trained + + def test_model_predict(psms): """Test predictions""" model = mokapot.Model(LogisticRegression(), train_fdr=0.05, max_iter=1)