From 5148da1c09cfc92f9b776ae8306619429d0656cb Mon Sep 17 00:00:00 2001
From: Will Fondrie <fondriew@gmail.com>
Date: Fri, 3 Sep 2021 10:14:45 -0700
Subject: [PATCH] added support for subset > data size (#40)

Co-authored-by: Jspaezp <jspaezp@gmail.com>
---
 CHANGELOG.md                   |  5 +++++
 mokapot/config.py              |  6 ++++--
 mokapot/model.py               | 26 ++++++++++++++++++++------
 tests/system_tests/test_cli.py |  2 ++
 tests/unit_tests/test_model.py | 12 ++++++++++++
 5 files changed, 43 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7d4edb65..d66f6727 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Changelog for mokapot  
 
+## [0.7.4] - 2021-09-03
+### Changed
+- Improved documentation and added warnings for `--subset_max_train`; values
+  larger than the number of training PSMs are now ignored. Thanks @jspaezp!
+
 ## [0.7.3] - 2021-07-20
 ### Fixed
 - Fixed bug where the `--keep_decoys` did not work with `--aggregate`. Also,
diff --git a/mokapot/config.py b/mokapot/config.py
index 8e6b95ee..04a7cbfa 100644
--- a/mokapot/config.py
+++ b/mokapot/config.py
@@ -216,8 +216,10 @@ def _parser():
         type=int,
         default=None,
         help=(
-            "Use only a random subset of PSMs for training. "
-            "This is useful for very large datasets."
+            "Maximum number of PSMs to use during the training "
+            "of each of the cross validation folds in the model. "
+            "This is useful for very large datasets and will be "
+            "ignored if less PSMS are available."
         ),
     )
 
diff --git a/mokapot/model.py b/mokapot/model.py
index 1eed471b..ceea21a8 100644
--- a/mokapot/model.py
+++ b/mokapot/model.py
@@ -266,12 +266,26 @@ def fit(self, psms):
             )
 
         if self.subset_max_train is not None:
-            subset_idx = np.random.choice(
-                len(psms), self.subset_max_train, replace=False
-            )
-
-            psms = copy.copy(psms)
-            psms._data = psms._data.iloc[subset_idx, :]
+            if self.subset_max_train > len(psms):
+                LOGGER.warning(
+                    "The provided subset value (%i) is larger than the number "
+                    "of psms in the training split (%i), so it will be "
+                    "ignored.",
+                    self.subset_max_train,
+                    len(psms),
+                )
+            else:
+                LOGGER.info(
+                    "Subsetting PSMs (%i) to (%i).",
+                    len(psms),
+                    self.subset_max_train,
+                )
+                subset_idx = np.random.choice(
+                    len(psms), self.subset_max_train, replace=False
+                )
+
+                psms = copy.copy(psms)
+                psms._data = psms._data.iloc[subset_idx, :]
 
         # Choose the initial direction
         start_labels, feat_pass = _get_starting_labels(psms, self)
diff --git a/tests/system_tests/test_cli.py b/tests/system_tests/test_cli.py
index d33da6ed..83ee743f 100644
--- a/tests/system_tests/test_cli.py
+++ b/tests/system_tests/test_cli.py
@@ -67,6 +67,8 @@ def test_cli_options(tmp_path, scope_files):
         "--max_iter",
         "1",
         "--keep_decoys",
+        "--subset_max_train",
+        "50000",
     ]
 
     subprocess.run(cmd, check=True)
diff --git a/tests/unit_tests/test_model.py b/tests/unit_tests/test_model.py
index ec519d18..06a62903 100644
--- a/tests/unit_tests/test_model.py
+++ b/tests/unit_tests/test_model.py
@@ -72,6 +72,18 @@ def test_model_fit(psms):
     assert model.is_trained
 
 
+def test_model_fit_large_subset(psms):
+    model = mokapot.Model(
+        LogisticRegression(),
+        train_fdr=0.05,
+        max_iter=1,
+        subset_max_train=2_000_000_000,  # meant to exceed len(psms)
+    )
+    model.fit(psms)
+
+    assert model.is_trained
+
+
 def test_model_predict(psms):
     """Test predictions"""
     model = mokapot.Model(LogisticRegression(), train_fdr=0.05, max_iter=1)