From 7ec31be0611808c3be41419711bd368ba38b6eae Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Thu, 6 Apr 2023 19:16:39 +0200 Subject: [PATCH 01/30] Upgrade depthcharge (#160) * Upgrade depthcharge * Update CHANGELOG.md --- CHANGELOG.md | 1 + pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8fb8c611..d703e9da 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Spectra are correctly matched to their input peak file when analyzing multiple files simultaneously. - The score of the stop token is taken into account when calculating the predicted peptide score. - Peptides with incorrect N-terminal modifications (multiple or internal positions) are no longer predicted. +- Upgrade to Depthcharge v0.2.0 to fix sinusoidal encoding. ## [3.2.0] - 2022-11-18 diff --git a/pyproject.toml b/pyproject.toml index db0fbe35..1b14d1e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ requires-python = ">=3.8" dependencies = [ "appdirs", "click", - "depthcharge-ms>=0.1.0,<0.2.0", + "depthcharge-ms>=0.2.0", "natsort", "numpy", "pandas", From 59974e36ceeeffedaeeeac6b3be590ccb1b33b0b Mon Sep 17 00:00:00 2001 From: Melih Yilmaz <32707537+melihyilmaz@users.noreply.github.com> Date: Mon, 10 Apr 2023 09:57:19 -0700 Subject: [PATCH 02/30] Fix val step and add unit test (#164) --- casanovo/denovo/model.py | 34 ++++++++++++++++------------------ tests/unit_tests/test_unit.py | 13 +++++++++++++ 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 2f6e99a0..b32fbd96 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -754,19 +754,18 @@ def validation_step( # Calculate and log amino acid and peptide match evaluation metrics from # the predicted peptides. - peptides_pred_raw, _ = self.forward(batch[0], batch[1]) - # FIXME: Temporary fix to skip predictions with multiple stop tokens. - peptides_pred, peptides_true = [], [] - for peptide_pred, peptide_true in zip(peptides_pred_raw, batch[2]): - if len(peptide_pred) > 0: - if peptide_pred[0] == "$": - peptide_pred = peptide_pred[1:] # Remove stop token. 
- if "$" not in peptide_pred and len(peptide_pred) > 0: - peptides_pred.append(peptide_pred) - peptides_true.append(peptide_true) + predicted_peptide_seq = [] + true_peptide_seq = batch[2] + + for spectrum_preds in self.forward(batch[0], batch[1]): + for _, _, peptide_seq in spectrum_preds: + predicted_peptide_seq.append(peptide_seq) + aa_precision, _, pep_precision = evaluate.aa_match_metrics( *evaluate.aa_match_batch( - peptides_pred, peptides_true, self.decoder._peptide_mass.masses + predicted_peptide_seq, + true_peptide_seq, + self.decoder._peptide_mass.masses, ) ) log_args = dict(on_step=False, on_epoch=True, sync_dist=True) @@ -842,13 +841,12 @@ def on_validation_epoch_end(self) -> None: metrics = { "epoch": self.trainer.current_epoch, "valid": callback_metrics["CELoss"]["valid"].detach(), - "valid_aa_precision": callback_metrics["aa_precision"][ - "valid" - ].detach(), - "valid_aa_recall": callback_metrics["aa_recall"]["valid"].detach(), - "valid_pep_recall": callback_metrics["pep_recall"][ - "valid" - ].detach(), + "valid_aa_precision": callback_metrics[ + "AA precision at coverage=1" + ]["valid"].detach(), + "valid_pep_precision": callback_metrics[ + "Peptide precision at coverage=1" + ]["valid"].detach(), } self._history.append(metrics) self._log_history() diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index a7b0df59..09a5ab47 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -480,3 +480,16 @@ def test_spectrum_id_mzml(mzml_small, tmp_path): ): spectrum_id = str(filename), f"scan={scan_nr}" assert dataset.get_spectrum_id(i) == spectrum_id + + +def test_train_val_step_functions(): + """Test train and validation step functions operating on batches.""" + model = Spec2Pep(n_beams=1, residues="massivekb", min_peptide_len=4) + spectra = torch.zeros(1, 5, 2) + precursors = torch.tensor([[469.25364, 2.0, 235.63410]]) + peptides = ["PEPK"] + batch = (spectra, precursors, peptides) + + # Check if valid loss value returned + assert model.training_step(batch) > 0 + assert model.validation_step(batch) > 0 From da1d16db30d2fc3233be1fa43a4ff24c049f4391 Mon Sep 17 00:00:00 2001 From: Melih Yilmaz <32707537+melihyilmaz@users.noreply.github.com> Date: Tue, 11 Apr 2023 11:56:57 -0700 Subject: [PATCH 03/30] Add validation frequency option (#165) * Fix logging and checkpointing bug * Add option to validate every n steps --- CHANGELOG.md | 3 ++ casanovo/config.py | 1 + casanovo/config.yaml | 2 +- casanovo/denovo/model.py | 49 +++++++++++++++++---------------- casanovo/denovo/model_runner.py | 2 ++ 5 files changed, 33 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d703e9da..728a98e8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,11 +12,14 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Included the `min_peptide_len` parameter in the configuration file to restrict predictions to peptide with a minimum length. - Export multiple PSMs per spectrum using the `top_match` parameter in the configuration file. +- `every_n_train_steps` parameter now controls the frequency of both validation epochs and model checkpointing during training. ### Changed - Calculate the amino acid scores as the average of the amino acid scores and the peptide score. - Spectra from mzML and mzXML peak files are referred to by their scan numbers in the mzTab output instead of their indexes. +- We now log steps rather than epochs as units of progress during training. 
+- Validation performance metrics are logged (and added to tensorboard) after each validation epoch, and the training loss is logged at the end of each training epoch, i.e. training and validation metrics are logged asynchronously.
- if len(self._history) > 0 and len(self._history[-1]) == 6: - if len(self._history) == 1: - logger.info( - "Epoch\tTrain loss\tValid loss\tAA precision\tAA recall\t" - "Peptide recall" - ) - metrics = self._history[-1] - if metrics["epoch"] % self.n_log == 0: + if len(self._history) == 1: + logger.info( + "Step\tTrain loss\tValid loss\tPeptide precision\tAA precision" + ) + metrics = self._history[-1] + if len(self._history) > 0: + if metrics["step"] % self.n_log == 0: logger.info( - "%i\t%.6f\t%.6f\t%.6f\t%.6f\t%.6f", - metrics["epoch"] + 1, + "%i\t%.6f\t%.6f\t%.6f\t%.6f", + metrics["step"], metrics.get("train", np.nan), metrics.get("valid", np.nan), + metrics.get("valid_pep_precision", np.nan), metrics.get("valid_aa_precision", np.nan), - metrics.get("valid_aa_recall", np.nan), - metrics.get("valid_pep_recall", np.nan), ) if self.tb_summarywriter is not None: for descr, key in [ ("loss/train_crossentropy_loss", "train"), - ("loss/dev_crossentropy_loss", "valid"), - ("eval/dev_aa_precision", "valid_aa_precision"), - ("eval/dev_aa_recall", "valid_aa_recall"), - ("eval/dev_pep_recall", "valid_pep_recall"), + ("loss/val_crossentropy_loss", "valid"), + ("eval/val_pep_precision", "valid_pep_precision"), + ("eval/val_aa_precision", "valid_aa_precision"), ]: - self.tb_summarywriter.add_scalar( - descr, - metrics.get(key, np.nan), - metrics["epoch"] + 1, - ) + metric_value = metrics.get(key, np.nan) + if metric_value is not np.nan: + self.tb_summarywriter.add_scalar( + descr, + metric_value, + metrics["step"], + ) def configure_optimizers( self, diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index bfc4e1a9..b92dd780 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -307,10 +307,12 @@ def train( auto_select_gpus=True, callbacks=callbacks, devices=_get_devices(config["no_gpu"]), + enable_checkpointing=config["save_model"], logger=config["logger"], max_epochs=config["max_epochs"], num_sanity_val_steps=config["num_sanity_val_steps"], strategy=_get_strategy(), + val_check_interval=config["every_n_train_steps"], ) # Train the model. trainer.fit( From 5ba0275e29eba17bbb0b711b13b030d969ba47cb Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Fri, 14 Apr 2023 21:41:37 +0200 Subject: [PATCH 04/30] Minor refactoring + issue fix (#166) Fixes minor issues in #165. --- casanovo/denovo/model.py | 60 ++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 33 deletions(-) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 78ec6f84..8b3ae3e0 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -754,18 +754,13 @@ def validation_step( # Calculate and log amino acid and peptide match evaluation metrics from # the predicted peptides. - predicted_peptide_seq = [] - true_peptide_seq = batch[2] - + peptides_pred, peptides_true = [], batch[2] for spectrum_preds in self.forward(batch[0], batch[1]): - for _, _, peptide_seq in spectrum_preds: - predicted_peptide_seq.append(peptide_seq) - + for _, _, pred in spectrum_preds: + peptides_pred.append(pred) aa_precision, _, pep_precision = evaluate.aa_match_metrics( *evaluate.aa_match_batch( - predicted_peptide_seq, - true_peptide_seq, - self.decoder._peptide_mass.masses, + peptides_pred, peptides_true, self.decoder._peptide_mass.masses ) ) log_args = dict(on_step=False, on_epoch=True, sync_dist=True) @@ -894,35 +889,34 @@ def _log_history(self) -> None: Write log to console, if requested. """ # Log only if all output for the current epoch is recorded. 
+ if len(self._history) == 0: + return if len(self._history) == 1: logger.info( "Step\tTrain loss\tValid loss\tPeptide precision\tAA precision" ) metrics = self._history[-1] - if len(self._history) > 0: - if metrics["step"] % self.n_log == 0: - logger.info( - "%i\t%.6f\t%.6f\t%.6f\t%.6f", - metrics["step"], - metrics.get("train", np.nan), - metrics.get("valid", np.nan), - metrics.get("valid_pep_precision", np.nan), - metrics.get("valid_aa_precision", np.nan), - ) - if self.tb_summarywriter is not None: - for descr, key in [ - ("loss/train_crossentropy_loss", "train"), - ("loss/val_crossentropy_loss", "valid"), - ("eval/val_pep_precision", "valid_pep_precision"), - ("eval/val_aa_precision", "valid_aa_precision"), - ]: - metric_value = metrics.get(key, np.nan) - if metric_value is not np.nan: - self.tb_summarywriter.add_scalar( - descr, - metric_value, - metrics["step"], - ) + if metrics["step"] % self.n_log == 0: + logger.info( + "%i\t%.6f\t%.6f\t%.6f\t%.6f", + metrics["step"], + metrics.get("train", np.nan), + metrics.get("valid", np.nan), + metrics.get("valid_pep_precision", np.nan), + metrics.get("valid_aa_precision", np.nan), + ) + if self.tb_summarywriter is not None: + for descr, key in [ + ("loss/train_crossentropy_loss", "train"), + ("loss/val_crossentropy_loss", "valid"), + ("eval/val_pep_precision", "valid_pep_precision"), + ("eval/val_aa_precision", "valid_aa_precision"), + ]: + metric_value = metrics.get(key, np.nan) + if not np.isnan(metric_value): + self.tb_summarywriter.add_scalar( + descr, metric_value, metrics["step"] + ) def configure_optimizers( self, From 868952e99feff06f59b7f5ffa2af7d41d95c339f Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Mon, 17 Apr 2023 08:39:33 +0200 Subject: [PATCH 05/30] Always use full file paths (#168) * Always use full file paths Fixes #167. * Update changelog * Formatting fix --- CHANGELOG.md | 20 +++++++++++++++----- casanovo/data/ms_io.py | 9 ++++----- casanovo/denovo/model_runner.py | 2 +- tests/unit_tests/test_unit.py | 13 +++++++++++++ 4 files changed, 33 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 728a98e8..1c87c3d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,20 +6,31 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ## [Unreleased] +### Added + +- `every_n_train_steps` parameter now controls the frequency of both validation epochs and model checkpointing during training. + +### Changed + +- We now log steps rather than epochs as units of progress during training. +- Validation performance metrics are logged (and added to tensorboard) at the validation epoch, and training loss is logged at the end of training epoch, i.e. training and validation metrics are logged asynchronously. + +### Fixed + +- Upgrade to Depthcharge v0.2.0 to fix sinusoidal encoding. +- Correctly refer to input peak files by their full file path. + ## [3.3.0] - 2023-04-04 ### Added - Included the `min_peptide_len` parameter in the configuration file to restrict predictions to peptide with a minimum length. - Export multiple PSMs per spectrum using the `top_match` parameter in the configuration file. -- `every_n_train_steps` parameter now controls the frequency of both validation epochs and model checkpointing during training. ### Changed - Calculate the amino acid scores as the average of the amino acid scores and the peptide score. - Spectra from mzML and mzXML peak files are referred to by their scan numbers in the mzTab output instead of their indexes. 
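+            # No training or validation epoch has completed yet, so there is
+            # nothing to log.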
-- We now log steps rather than epochs as units of progress during training. -- Validation performance metrics are logged (and added to tensorboard) at the validation epoch, and training loss is logged at the end of training epoch, i.e. training and validation metrics are logged asynchronously. ### Fixed @@ -27,7 +38,6 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Spectra are correctly matched to their input peak file when analyzing multiple files simultaneously. - The score of the stop token is taken into account when calculating the predicted peptide score. - Peptides with incorrect N-terminal modifications (multiple or internal positions) are no longer predicted. -- Upgrade to Depthcharge v0.2.0 to fix sinusoidal encoding. ## [3.2.0] - 2022-11-18 @@ -164,7 +174,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Initial Casanovo version. [Unreleased]: https://github.com/Noble-Lab/casanovo/compare/v3.3.0...HEAD -[3.2.0]: https://github.com/Noble-Lab/casanovo/compare/v3.2.0...v3.3.0 +[3.3.0]: https://github.com/Noble-Lab/casanovo/compare/v3.2.0...v3.3.0 [3.2.0]: https://github.com/Noble-Lab/casanovo/compare/v3.1.0...v3.2.0 [3.1.0]: https://github.com/Noble-Lab/casanovo/compare/v3.0.0...v3.1.0 [3.0.0]: https://github.com/Noble-Lab/casanovo/compare/v2.1.1...v3.0.0 diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py index 6dfda598..e3b3a8d6 100644 --- a/casanovo/data/ms_io.py +++ b/casanovo/data/ms_io.py @@ -135,11 +135,9 @@ def set_ms_run(self, peak_filenames: List[str]) -> None: The input peak file name(s). """ for i, filename in enumerate(natsort.natsorted(peak_filenames), 1): + filename = os.path.abspath(filename) self.metadata.append( - ( - f"ms_run[{i}]-location", - Path(os.path.abspath(filename)).as_uri(), - ), + (f"ms_run[{i}]-location", Path(filename).as_uri()), ) self._run_map[filename] = i @@ -180,6 +178,7 @@ def save(self) -> None: for i, psm in enumerate( natsort.natsorted(self.psms, key=operator.itemgetter(1)), 1 ): + filename, idx = os.path.abspath(psm[1][0]), psm[1][1] writer.writerow( [ "PSM", @@ -200,7 +199,7 @@ def save(self) -> None: psm[3], # charge psm[4], # exp_mass_to_charge psm[5], # calc_mass_to_charge - f"ms_run[{self._run_map[psm[1][0]]}]:{psm[1][1]}", + f"ms_run[{self._run_map[filename]}]:{idx}", "null", # pre "null", # post "null", # start diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index b92dd780..fb5deeba 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -346,7 +346,7 @@ def _get_peak_filenames( path = os.path.expanduser(path) path = os.path.expandvars(path) return [ - fn + os.path.abspath(fn) for fn in glob.glob(path, recursive=True) if os.path.splitext(fn.lower())[1] in supported_ext ] diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 09a5ab47..bc0509bd 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -13,6 +13,7 @@ from casanovo import casanovo from casanovo import utils +from casanovo.data import ms_io from casanovo.data.datasets import SpectrumDataset, AnnotatedSpectrumDataset from casanovo.denovo.evaluate import aa_match_batch, aa_match_metrics from casanovo.denovo.model import Spec2Pep, _aa_pep_score @@ -493,3 +494,15 @@ def test_train_val_step_functions(): # Check if valid loss value returned assert model.training_step(batch) > 0 assert model.validation_step(batch) > 0 + + +def test_run_map(mgf_small): + out_writer = 
ms_io.MztabWriter("dummy.mztab") + # Set peak file by base file name only. + out_writer.set_ms_run([os.path.basename(mgf_small.name)]) + assert os.path.basename(mgf_small.name) not in out_writer._run_map + assert os.path.abspath(mgf_small.name) in out_writer._run_map + # Set peak file by full path. + out_writer.set_ms_run([os.path.abspath(mgf_small.name)]) + assert os.path.basename(mgf_small.name) not in out_writer._run_map + assert os.path.abspath(mgf_small.name) in out_writer._run_map From effc955ab488212f5f95ae02a896c66478535704 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Mon, 17 Apr 2023 20:49:30 +0200 Subject: [PATCH 06/30] Only split off known extensions from output filename (#171) * Only split off extension if it's mzTab * Also check for .log extension * Update output format in help message --- casanovo/casanovo.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index d3e42642..72b02057 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -67,7 +67,7 @@ @click.option( "--output", help="The base output file name to store logging (extension: .log) and " - "(optionally) prediction results (extension: .csv).", + "(optionally) prediction results (extension: .mztab).", type=click.Path(dir_okay=False), ) def main( @@ -96,7 +96,8 @@ def main( f"casanovo_{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}", ) else: - output = os.path.splitext(os.path.abspath(output))[0] + basename, ext = os.path.splitext(os.path.abspath(output)) + output = basename if ext.lower() in (".log", ".mztab") else output # Configure logging. logging.captureWarnings(True) From 6299bd26b44d0d4c9b3c0bd20d23bcf595a4b0c2 Mon Sep 17 00:00:00 2001 From: Will Fondrie Date: Wed, 10 May 2023 15:11:33 -0700 Subject: [PATCH 07/30] Fix CPU bug, overhaul model runner, and update to lightning >=2.0 (#176) * Overhaul runner * Update linting to only happen once * Fix linting error * Specify utf-8 encoding * Specify utf-8 encoding only for default config * Skip weights tests for now * Update skipping API test * Revert accidental max_epochs change * msg -> reason for pytest.mark.skip * Wout's suggestions and more tests * Remove encoding * Specify device type when weight loading * Fix lint * Capture init params and figure out device automagically * Add runner tests * Fix bug and limit saved models * Support old weights too * Remove every_n_train_steps from checkpoint --------- Co-authored-by: melihyilmaz --- .github/workflows/{black.yml => lint.yml} | 6 +- .github/workflows/tests.yml | 2 +- .gitignore | 2 + casanovo/casanovo.py | 30 +- casanovo/config.py | 12 +- casanovo/config.yaml | 17 +- casanovo/denovo/__init__.py | 1 + casanovo/denovo/dataloaders.py | 30 +- casanovo/denovo/model.py | 33 +- casanovo/denovo/model_runner.py | 692 +++++++++++----------- pyproject.toml | 4 +- tests/conftest.py | 23 + tests/test_integration.py | 82 ++- tests/unit_tests/test_config.py | 4 +- tests/unit_tests/test_runner.py | 94 +++ tests/unit_tests/test_unit.py | 1 + 16 files changed, 614 insertions(+), 419 deletions(-) rename .github/workflows/{black.yml => lint.yml} (87%) create mode 100644 tests/unit_tests/test_runner.py diff --git a/.github/workflows/black.yml b/.github/workflows/lint.yml similarity index 87% rename from .github/workflows/black.yml rename to .github/workflows/lint.yml index cec52e2b..ce576f53 100644 --- a/.github/workflows/black.yml +++ b/.github/workflows/lint.yml @@ -1,6 +1,10 @@ name: Lint -on: [push, pull_request] +on: + push: + branches: [ 
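+    # Regardless of how a peak file is specified, set_ms_run() should record
+    # it under its absolute path in the run map.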
main ] + pull_request: + branches: [ main ] jobs: lint: diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1d7fe2f7..ea5a1eb8 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -14,7 +14,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-latest, windows-latest] + os: [ubuntu-latest, windows-latest, macos-latest] steps: - uses: actions/checkout@v2 diff --git a/.gitignore b/.gitignore index 32202470..aa8178a5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ # Test stuff: test_path/ +lightning_logs/ +envs/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 72b02057..f07d7b43 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -18,12 +18,12 @@ import torch import tqdm import yaml -from pytorch_lightning.lite import LightningLite +from lightning.pytorch import seed_everything from . import __version__ from . import utils from .data import ms_io -from .denovo import model_runner +from .denovo import ModelRunner from .config import Config logger = logging.getLogger("casanovo") @@ -52,11 +52,13 @@ required=True, help="The file path with peak files for predicting peptide sequences or " "training Casanovo.", + multiple=True, ) @click.option( "--peak_path_val", help="The file path with peak files to be used as validation data during " "training.", + multiple=True, ) @click.option( "--config", @@ -127,7 +129,7 @@ def main( # Read parameters from the config file. config = Config(config) - LightningLite.seed_everything(seed=config["random_seed"], workers=True) + seed_everything(seed=config["random_seed"], workers=True) # Download model weights if these were not specified (except when training). if model is None and mode != "train": @@ -159,18 +161,16 @@ def main( logger.debug("%s = %s", str(key), str(value)) # Run Casanovo in the specified mode. - if mode == "denovo": - logger.info("Predict peptide sequences with Casanovo.") - writer = ms_io.MztabWriter(f"{output}.mztab") - writer.set_metadata(config, model=model, config_filename=config.file) - model_runner.predict(peak_path, model, config, writer) - writer.save() - elif mode == "eval": - logger.info("Evaluate a trained Casanovo model.") - model_runner.evaluate(peak_path, model, config) - elif mode == "train": - logger.info("Train the Casanovo model.") - model_runner.train(peak_path, peak_path_val, model, config) + with ModelRunner(config, model) as model_runner: + if mode == "denovo": + logger.info("Predict peptide sequences with Casanovo.") + model_runner.predict(peak_path, output) + elif mode == "eval": + logger.info("Evaluate a trained Casanovo model.") + model_runner.evaluate(peak_path) + elif mode == "train": + logger.info("Train the Casanovo model.") + model_runner.train(peak_path, peak_path_val) def _get_model_weights() -> str: diff --git a/casanovo/config.py b/casanovo/config.py index 4dc93c26..fbbf2e16 100644 --- a/casanovo/config.py +++ b/casanovo/config.py @@ -63,11 +63,11 @@ class Config: max_epochs=int, num_sanity_val_steps=int, train_from_scratch=bool, - save_model=bool, + save_top_k=int, model_save_folder_path=str, - save_weights_only=bool, every_n_train_steps=int, - no_gpu=bool, + accelerator=str, + devices=int, ) def __init__(self, config_file: Optional[str] = None): @@ -86,13 +86,7 @@ def __init__(self, config_file: Optional[str] = None): for key, val in self._config_types.items(): self.validate_param(key, val) - # Add extra configuration options and scale by the number of GPUs. 
- n_gpus = 0 if self["no_gpu"] else torch.cuda.device_count() self._params["n_workers"] = utils.n_workers() - if n_gpus > 1: - self._params["train_batch_size"] = ( - self["train_batch_size"] // n_gpus - ) def __getitem__(self, param: str) -> Union[int, bool, str, Tuple, Dict]: """Retrieve a parameter""" diff --git a/casanovo/config.yaml b/casanovo/config.yaml index 0e5c6b95..7b8379ab 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -111,13 +111,18 @@ max_epochs: 30 num_sanity_val_steps: 0 # Set to "False" to further train a pre-trained Casanovo model train_from_scratch: True -# Save model checkpoints during training -save_model: True +# Save the top k model checkpoints during training. -1 saves all and +# leaving this field empty saves none. +save_top_k: 5 # Path to saved checkpoints model_save_folder_path: "" -# Set to "False" to save the PyTorch model instance -save_weights_only: True # Model validation and checkpointing frequency in training steps every_n_train_steps: 50_000 -# Disable usage of a GPU (including Apple MPS): -no_gpu: False +# The hardware accelerator to use. Must be one of: +# "cpu", "gpu", "tpu", "ipu", "hpu", "mps", or "auto" +accelerator: "auto" +# The devices to use. Can be set to a positive number int, +# or the value -1 to indicate all available devices should be used, +# If left empty, the appropriate number will be automatically +# selected for automatic selected on the chosen accelerator. +devices: diff --git a/casanovo/denovo/__init__.py b/casanovo/denovo/__init__.py index e69de29b..da194f1b 100644 --- a/casanovo/denovo/__init__.py +++ b/casanovo/denovo/__init__.py @@ -0,0 +1 @@ +from .model_runner import ModelRunner diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 2ee2f8f5..7ab78355 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -3,8 +3,8 @@ import os from typing import List, Optional, Tuple +import lightning.pytorch as pl import numpy as np -import pytorch_lightning as pl import torch from depthcharge.data import AnnotatedSpectrumIndex @@ -23,8 +23,10 @@ class DeNovoDataModule(pl.LightningDataModule): The spectrum index file corresponding to the validation data. test_index : Optional[AnnotatedSpectrumIndex] The spectrum index file corresponding to the testing data. - batch_size : int - The batch size to use for training and evaluating. + train_batch_size : int + The batch size to use for training. + eval_batch_size : int + The batch size to use for inference. n_peaks : Optional[int] The number of top-n most intense peaks to keep in each spectrum. `None` retains all peaks. 
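         # Scaling the batch size by the number of devices is now handled by
         # the model runner rather than by the config.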
@@ -52,7 +54,8 @@ def __init__( train_index: Optional[AnnotatedSpectrumIndex] = None, valid_index: Optional[AnnotatedSpectrumIndex] = None, test_index: Optional[AnnotatedSpectrumIndex] = None, - batch_size: int = 128, + train_batch_size: int = 128, + eval_batch_size: int = 1028, n_peaks: Optional[int] = 150, min_mz: float = 50.0, max_mz: float = 2500.0, @@ -65,7 +68,8 @@ def __init__( self.train_index = train_index self.valid_index = valid_index self.test_index = test_index - self.batch_size = batch_size + self.train_batch_size = train_batch_size + self.eval_batch_size = eval_batch_size self.n_peaks = n_peaks self.min_mz = min_mz self.max_mz = max_mz @@ -119,7 +123,9 @@ def setup(self, stage: str = None, annotated: bool = True) -> None: self.test_dataset = make_dataset(self.test_index) def _make_loader( - self, dataset: torch.utils.data.Dataset + self, + dataset: torch.utils.data.Dataset, + batch_size: int, ) -> torch.utils.data.DataLoader: """ Create a PyTorch DataLoader. @@ -128,6 +134,8 @@ def _make_loader( ---------- dataset : torch.utils.data.Dataset A PyTorch Dataset. + batch_size : int + The batch size to use. Returns ------- @@ -136,7 +144,7 @@ def _make_loader( """ return torch.utils.data.DataLoader( dataset, - batch_size=self.batch_size, + batch_size=batch_size, collate_fn=prepare_batch, pin_memory=True, num_workers=self.n_workers, @@ -144,19 +152,19 @@ def _make_loader( def train_dataloader(self) -> torch.utils.data.DataLoader: """Get the training DataLoader.""" - return self._make_loader(self.train_dataset) + return self._make_loader(self.train_dataset, self.train_batch_size) def val_dataloader(self) -> torch.utils.data.DataLoader: """Get the validation DataLoader.""" - return self._make_loader(self.valid_dataset) + return self._make_loader(self.valid_dataset, self.eval_batch_size) def test_dataloader(self) -> torch.utils.data.DataLoader: """Get the test DataLoader.""" - return self._make_loader(self.test_dataset) + return self._make_loader(self.test_dataset, self.eval_batch_size) def predict_dataloader(self) -> torch.utils.data.DataLoader: """Get the predict DataLoader.""" - return self._make_loader(self.test_dataset) + return self._make_loader(self.test_dataset, self.eval_batch_size) def prepare_batch( diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 8b3ae3e0..1105a9e7 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -7,9 +7,9 @@ import depthcharge.masses import einops -import numpy as np -import pytorch_lightning as pl import torch +import numpy as np +import lightning.pytorch as pl from torch.utils.tensorboard import SummaryWriter from depthcharge.components import ModelMixin, PeptideDecoder, SpectrumEncoder @@ -114,6 +114,7 @@ def __init__( **kwargs: Dict, ): super().__init__() + self.save_hyperparameters() # Build the model. 
if custom_encoder is not None: @@ -724,8 +725,8 @@ def training_step( pred = pred[:, :-1, :].reshape(-1, self.decoder.vocab_size + 1) loss = self.celoss(pred, truth.flatten()) self.log( - "CELoss", - {mode: loss.detach()}, + f"{mode}_CELoss", + loss.detach(), on_step=False, on_epoch=True, sync_dist=True, @@ -766,12 +767,10 @@ def validation_step( log_args = dict(on_step=False, on_epoch=True, sync_dist=True) self.log( "Peptide precision at coverage=1", - {"valid": pep_precision}, + pep_precision, **log_args, ) - self.log( - "AA precision at coverage=1", {"valid": aa_precision}, **log_args - ) + self.log("AA precision at coverage=1", aa_precision, **log_args) return loss @@ -824,7 +823,7 @@ def on_train_epoch_end(self) -> None: """ Log the training loss at the end of each epoch. """ - train_loss = self.trainer.callback_metrics["CELoss"]["train"].detach() + train_loss = self.trainer.callback_metrics["train_CELoss"].detach() metrics = { "step": self.trainer.global_step, "train": train_loss, @@ -839,19 +838,21 @@ def on_validation_epoch_end(self) -> None: callback_metrics = self.trainer.callback_metrics metrics = { "step": self.trainer.global_step, - "valid": callback_metrics["CELoss"]["valid"].detach(), + "valid": callback_metrics["valid_CELoss"].detach(), "valid_aa_precision": callback_metrics[ "AA precision at coverage=1" - ]["valid"].detach(), + ].detach(), "valid_pep_precision": callback_metrics[ "Peptide precision at coverage=1" - ]["valid"].detach(), + ].detach(), } self._history.append(metrics) self._log_history() - def on_predict_epoch_end( - self, results: List[List[Tuple[np.ndarray, List[str], torch.Tensor]]] + def on_predict_batch_end( + self, + outputs: List[Tuple[np.ndarray, List[str], torch.Tensor]], + *args, ) -> None: """ Write the predicted peptide sequences and amino acid scores to the @@ -867,9 +868,7 @@ def on_predict_epoch_end( peptide, peptide_score, aa_scores, - ) in itertools.chain.from_iterable( - itertools.chain.from_iterable(results) - ): + ) in outputs: if len(peptide) == 0: continue self.out_writer.psms.append( diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index fb5deeba..2c22bd62 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -6,15 +6,17 @@ import os import tempfile import uuid +from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Union +import lightning.pytorch as pl import numpy as np -import pytorch_lightning as pl import torch from depthcharge.data import AnnotatedSpectrumIndex, SpectrumIndex -from pytorch_lightning.strategies import DDPStrategy +from lightning.pytorch.strategies import DDPStrategy from .. import utils +from ..config import Config from ..data import ms_io from ..denovo.dataloaders import DeNovoDataModule from ..denovo.model import Spec2Pep @@ -23,307 +25,360 @@ logger = logging.getLogger("casanovo") -def predict( - peak_path: str, - model_filename: str, - config: Dict[str, Any], - out_writer: ms_io.MztabWriter, -) -> None: - """ - Predict peptide sequences with a trained Casanovo model. - - Parameters - ---------- - peak_path : str - The path with peak files for predicting peptide sequences. - model_filename : str - The file name of the model weights (.ckpt file). - config : Dict[str, Any] - The configuration options. - out_writer : ms_io.MztabWriter - The mzTab writer to export the prediction results. 
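             # A user-supplied encoder takes precedence over building a new
             # one; save_hyperparameters() above stores the init arguments so
             # that checkpoints can be reloaded without re-specifying them.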
- """ - _execute_existing(peak_path, model_filename, config, False, out_writer) - - -def evaluate( - peak_path: str, model_filename: str, config: Dict[str, Any] -) -> None: - """ - Evaluate peptide sequence predictions from a trained Casanovo model. +class ModelRunner: + """A class to run Casanovo models. Parameters ---------- - peak_path : str - The path with peak files for predicting peptide sequences. - model_filename : str - The file name of the model weights (.ckpt file). - config : Dict[str, Any] - The configuration options. + config : Config object + The casanovo configuration. + model_filename : str, optional + The model filename is required for eval and de novo modes, + but not for training a model from scratch. """ - _execute_existing(peak_path, model_filename, config, True) + def __init__( + self, + config: Config, + model_filename: Optional[str] = None, + ) -> None: + """Initialize a ModelRunner""" + self.config = config + self.model_filename = model_filename + + # Initialized later: + self.tmp_dir = None + self.trainer = None + self.model = None + self.loaders = None + self.writer = None + + # Configure checkpoints. + if config.save_top_k is not None: + self.callbacks = [ + pl.callbacks.ModelCheckpoint( + dirpath=config.model_save_folder_path, + monitor="valid_CELoss", + mode="min", + save_top_k=config.save_top_k, + ) + ] + else: + self.callbacks = None + + def __enter__(self): + """Enter the context manager""" + self.tmp_dir = tempfile.TemporaryDirectory() + return self + + def __exit__(self, exc_type, exc_value, traceback): + """Cleanup on exit""" + self.tmp_dir.cleanup() + self.tmp_dir = None + if self.writer is not None: + self.writer.save() + + def train( + self, + train_peak_path: Iterable[str], + valid_peak_path: Iterable[str], + ) -> None: + """Train the Casanovo model. + + Parameters + ---------- + train_peak_path : iterable of str + The path to the MS data files for training. + valid_peak_path : iterable of str + The path to the MS data files for validation. + + Returns + ------- + self + """ + self.initialize_trainer(train=True) + self.initialize_model(train=True) + + train_index = self._get_index(train_peak_path, True, "training") + valid_index = self._get_index(valid_peak_path, True, "validation") + self.initialize_data_module(train_index, valid_index) + self.loaders.setup() + + self.trainer.fit( + self.model, + self.loaders.train_dataloader(), + self.loaders.val_dataloader(), + ) -def _execute_existing( - peak_path: str, - model_filename: str, - config: Dict[str, Any], - annotated: bool, - out_writer: Optional[ms_io.MztabWriter] = None, -) -> None: - """ - Predict peptide sequences with a trained Casanovo model with/without - evaluation. + def evaluate(self, peak_path: Iterable[str]) -> None: + """Evaluate peptide sequence preditions from a trained Casanovo model. + + Parameters + ---------- + peak_path : iterable of str + The path with MS data files for predicting peptide sequences. + + Returns + ------- + self + """ + self.initialize_trainer(train=False) + self.initialize_model(train=False) + + test_index = self._get_index(peak_path, True, "evaluation") + self.initialize_data_module(test_index=test_index) + self.loaders.setup(stage="test", annotated=True) + + self.trainer.validate(self.model, self.loaders.test_dataloader()) + + def predict(self, peak_path: Iterable[str], output: str) -> None: + """Predict peptide sequences with a trained Casanovo model. 
+ + Parameters + ---------- + peak_path : iterable of str + The path with the MS data files for predicting peptide sequences. + output : str + Where should the output be saved? + + Returns + ------- + self + """ + self.writer = ms_io.MztabWriter(f"{output}.mztab") + self.writer.set_metadata( + self.config, + model=str(self.model_filename), + config_filename=self.config.file, + ) - Parameters - ---------- - peak_path : str - The path with peak files for predicting peptide sequences. - model_filename : str - The file name of the model weights (.ckpt file). - config : Dict[str, Any] - The configuration options. - annotated : bool - Whether the input peak files are annotated (execute in evaluation mode) - or not (execute in prediction mode only). - out_writer : Optional[ms_io.MztabWriter] - The mzTab writer to export the prediction results. - """ - # Load the trained model. - if not os.path.isfile(model_filename): - logger.error( - "Could not find the trained model weights at file %s", - model_filename, + self.initialize_trainer(train=False) + self.initialize_model(train=False) + self.model.out_writer = self.writer + + test_index = self._get_index(peak_path, False, "") + self.writer.set_ms_run(test_index.ms_files) + self.initialize_data_module(test_index=test_index) + self.loaders.setup(stage="test", annotated=False) + self.trainer.predict(self.model, self.loaders.test_dataloader()) + + def initialize_trainer(self, train: bool) -> None: + """Initialize the lightning Trainer. + + Parameters + ---------- + train : bool + Determines whether to set the trainer up for model training + or evaluation / inference. + """ + trainer_cfg = dict( + accelerator=self.config.accelerator, + devices=1, + logger=self.config.logger, ) - raise FileNotFoundError("Could not find the trained model weights") - model = Spec2Pep().load_from_checkpoint( - model_filename, - dim_model=config["dim_model"], - n_head=config["n_head"], - dim_feedforward=config["dim_feedforward"], - n_layers=config["n_layers"], - dropout=config["dropout"], - dim_intensity=config["dim_intensity"], - custom_encoder=config["custom_encoder"], - max_length=config["max_length"], - residues=config["residues"], - max_charge=config["max_charge"], - precursor_mass_tol=config["precursor_mass_tol"], - isotope_error_range=config["isotope_error_range"], - min_peptide_len=config["min_peptide_len"], - n_beams=config["n_beams"], - top_match=config["top_match"], - n_log=config["n_log"], - out_writer=out_writer, - ) - # Read the MS/MS spectra for which to predict peptide sequences. 
- if annotated: - peak_ext = (".mgf", ".h5", ".hdf5") - else: - peak_ext = (".mgf", ".mzml", ".mzxml", ".h5", ".hdf5") - if len(peak_filenames := _get_peak_filenames(peak_path, peak_ext)) == 0: - logger.error("Could not find peak files from %s", peak_path) - raise FileNotFoundError("Could not find peak files") - else: - out_writer.set_ms_run(peak_filenames) - peak_is_index = any( - [os.path.splitext(fn)[1] in (".h5", ".hdf5") for fn in peak_filenames] - ) - if peak_is_index and len(peak_filenames) > 1: - logger.error("Multiple HDF5 spectrum indexes specified") - raise ValueError("Multiple HDF5 spectrum indexes specified") - tmp_dir = tempfile.TemporaryDirectory() - if peak_is_index: - idx_filename, peak_filenames = peak_filenames[0], None - else: - idx_filename = os.path.join(tmp_dir.name, f"{uuid.uuid4().hex}.hdf5") - SpectrumIdx = AnnotatedSpectrumIndex if annotated else SpectrumIndex - valid_charge = np.arange(1, config["max_charge"] + 1) - index = SpectrumIdx( - idx_filename, peak_filenames, valid_charge=valid_charge - ) - # Initialize the data loader. - loaders = DeNovoDataModule( - test_index=index, - n_peaks=config["n_peaks"], - min_mz=config["min_mz"], - max_mz=config["max_mz"], - min_intensity=config["min_intensity"], - remove_precursor_tol=config["remove_precursor_tol"], - n_workers=config["n_workers"], - batch_size=config["predict_batch_size"], - ) - loaders.setup(stage="test", annotated=annotated) - - # Create the Trainer object. - trainer = pl.Trainer( - accelerator="auto", - auto_select_gpus=True, - devices=_get_devices(config["no_gpu"]), - logger=config["logger"], - max_epochs=config["max_epochs"], - num_sanity_val_steps=config["num_sanity_val_steps"], - strategy=_get_strategy(), - ) - # Run the model with/without validation. - run_trainer = trainer.validate if annotated else trainer.predict - run_trainer(model, loaders.test_dataloader()) - # Clean up temporary files. - tmp_dir.cleanup() - - -def train( - peak_path: str, - peak_path_val: str, - model_filename: str, - config: Dict[str, Any], -) -> None: - """ - Train a Casanovo model. - The model can be trained from scratch or by continuing training an existing - model. + if train: + if self.config.devices is None: + devices = "auto" + else: + devices = self.config.devices + + additional_cfg = dict( + devices=devices, + callbacks=self.callbacks, + enable_checkpointing=self.config.save_top_k is not None, + max_epochs=self.config.max_epochs, + num_sanity_val_steps=self.config.num_sanity_val_steps, + strategy=self._get_strategy(), + val_check_interval=self.config.every_n_train_steps, + ) + trainer_cfg.update(additional_cfg) + + self.trainer = pl.Trainer(**trainer_cfg) + + def initialize_model(self, train: bool) -> None: + """Initialize the Casanovo model. + + Parameters + ---------- + train : bool + Determines whether to set the model up for model training + or evaluation / inference. 
+ """ + model_params = dict( + dim_model=self.config.dim_model, + n_head=self.config.n_head, + dim_feedforward=self.config.dim_feedforward, + n_layers=self.config.n_layers, + dropout=self.config.dropout, + dim_intensity=self.config.dim_intensity, + custom_encoder=self.config.custom_encoder, + max_length=self.config.max_length, + residues=self.config.residues, + max_charge=self.config.max_charge, + precursor_mass_tol=self.config.precursor_mass_tol, + isotope_error_range=self.config.isotope_error_range, + n_beams=self.config.n_beams, + top_match=self.config.top_match, + n_log=self.config.n_log, + tb_summarywriter=self.config.tb_summarywriter, + warmup_iters=self.config.warmup_iters, + max_iters=self.config.max_iters, + lr=self.config.learning_rate, + weight_decay=self.config.weight_decay, + out_writer=self.writer, + ) - Parameters - ---------- - peak_path : str - The path with peak files to be used as training data. - peak_path_val : str - The path with peak files to be used as validation data. - model_filename : str - The file name of the model weights (.ckpt file). - config : Dict[str, Any] - The configuration options. - """ - # Read the MS/MS spectra to use for training and validation. - ext = (".mgf", ".h5", ".hdf5") - if len(train_filenames := _get_peak_filenames(peak_path, ext)) == 0: - logger.error("Could not find training peak files from %s", peak_path) - raise FileNotFoundError("Could not find training peak files") - train_is_index = any( - [os.path.splitext(fn)[1] in (".h5", ".hdf5") for fn in train_filenames] - ) - if train_is_index and len(train_filenames) > 1: - logger.error("Multiple training HDF5 spectrum indexes specified") - raise ValueError("Multiple training HDF5 spectrum indexes specified") - if ( - peak_path_val is None - or len(val_filenames := _get_peak_filenames(peak_path_val, ext)) == 0 - ): - logger.error( - "Could not find validation peak files from %s", peak_path_val + from_scratch = ( + self.config.train_from_scratch, + self.model_filename is None, ) - raise FileNotFoundError("Could not find validation peak files") - val_is_index = any( - [os.path.splitext(fn)[1] in (".h5", ".hdf5") for fn in val_filenames] - ) - if val_is_index and len(val_filenames) > 1: - logger.error("Multiple validation HDF5 spectrum indexes specified") - raise ValueError("Multiple validation HDF5 spectrum indexes specified") - tmp_dir = tempfile.TemporaryDirectory() - if train_is_index: - train_idx_fn, train_filenames = train_filenames[0], None - else: - train_idx_fn = os.path.join(tmp_dir.name, f"{uuid.uuid4().hex}.hdf5") - valid_charge = np.arange(1, config["max_charge"] + 1) - train_index = AnnotatedSpectrumIndex( - train_idx_fn, train_filenames, valid_charge=valid_charge - ) - if val_is_index: - val_idx_fn, val_filenames = val_filenames[0], None - else: - val_idx_fn = os.path.join(tmp_dir.name, f"{uuid.uuid4().hex}.hdf5") - val_index = AnnotatedSpectrumIndex( - val_idx_fn, val_filenames, valid_charge=valid_charge - ) - # Initialize the data loaders. - dataloader_params = dict( - batch_size=config["train_batch_size"], - n_peaks=config["n_peaks"], - min_mz=config["min_mz"], - max_mz=config["max_mz"], - min_intensity=config["min_intensity"], - remove_precursor_tol=config["remove_precursor_tol"], - n_workers=config["n_workers"], - ) - train_loader = DeNovoDataModule( - train_index=train_index, **dataloader_params - ) - train_loader.setup() - val_loader = DeNovoDataModule(valid_index=val_index, **dataloader_params) - val_loader.setup() - # Initialize the model. 
- model_params = dict( - dim_model=config["dim_model"], - n_head=config["n_head"], - dim_feedforward=config["dim_feedforward"], - n_layers=config["n_layers"], - dropout=config["dropout"], - dim_intensity=config["dim_intensity"], - custom_encoder=config["custom_encoder"], - max_length=config["max_length"], - residues=config["residues"], - max_charge=config["max_charge"], - precursor_mass_tol=config["precursor_mass_tol"], - isotope_error_range=config["isotope_error_range"], - n_beams=config["n_beams"], - top_match=config["top_match"], - n_log=config["n_log"], - tb_summarywriter=config["tb_summarywriter"], - warmup_iters=config["warmup_iters"], - max_iters=config["max_iters"], - lr=config["learning_rate"], - weight_decay=config["weight_decay"], - ) - if config["train_from_scratch"]: - model = Spec2Pep(**model_params) - else: - if not os.path.isfile(model_filename): + if train and any(from_scratch): + self.model = Spec2Pep(**model_params) + return + elif self.model_filename is None: + logger.error("A model file must be provided") + raise ValueError("A model file must be provided") + + if not Path(self.model_filename).exists(): logger.error( - "Could not find the model weights at file %s to continue " - "training", - model_filename, + "Could not find the model weights at file %s", + self.model_filename, ) - raise FileNotFoundError( - "Could not find the model weights to continue training" + raise FileNotFoundError("Could not find the model weights file") + + # First try loading model details from the weithgs file, + # otherwise use the provided configuration. + device = torch.empty(1).device # Use the default device. + try: + self.model = Spec2Pep.load_from_checkpoint( + self.model_filename, + map_location=device, ) - model = Spec2Pep().load_from_checkpoint(model_filename, **model_params) - # Create the Trainer object and (optionally) a checkpoint callback to - # periodically save the model. - if config["save_model"]: - callbacks = [ - pl.callbacks.ModelCheckpoint( - dirpath=config["model_save_folder_path"], - save_top_k=-1, - save_weights_only=config["save_weights_only"], - every_n_train_steps=config["every_n_train_steps"], + except RuntimeError: + self.model = Spec2Pep.load_from_checkpoint( + self.model_filename, + map_location=device, + **model_params, ) - ] - else: - callbacks = None - - trainer = pl.Trainer( - accelerator="auto", - auto_select_gpus=True, - callbacks=callbacks, - devices=_get_devices(config["no_gpu"]), - enable_checkpointing=config["save_model"], - logger=config["logger"], - max_epochs=config["max_epochs"], - num_sanity_val_steps=config["num_sanity_val_steps"], - strategy=_get_strategy(), - val_check_interval=config["every_n_train_steps"], - ) - # Train the model. - trainer.fit( - model, train_loader.train_dataloader(), val_loader.val_dataloader() - ) - # Clean up temporary files. - tmp_dir.cleanup() + + def initialize_data_module( + self, + train_index: Optional[AnnotatedSpectrumIndex] = None, + valid_index: Optional[AnnotatedSpectrumIndex] = None, + test_index: ( + Optional[Union[AnnotatedSpectrumIndex, SpectrumIndex]] + ) = None, + ) -> None: + """Initialize the data module + + Parameters + ---------- + train_index : AnnotatedSpectrumIndex, optional + A spectrum index for model training. + valid_index : AnnotatedSpectrumIndex, optional + A spectrum index for validation. + test_index : AnnotatedSpectrumIndex or SpectrumIndex, optional + A spectrum index for evaluation or inference. 
+ """ + try: + n_devices = self.trainer.num_devices + train_bs = self.config.train_batch_size // n_devices + eval_bs = self.config.predict_batch_size // n_devices + except AttributeError: + raise RuntimeError("Please use `initialize_trainer()` first.") + + self.loaders = DeNovoDataModule( + train_index=train_index, + valid_index=valid_index, + test_index=test_index, + min_mz=self.config.min_mz, + max_mz=self.config.max_mz, + min_intensity=self.config.min_intensity, + remove_precursor_tol=self.config.remove_precursor_tol, + n_workers=self.config.n_workers, + train_batch_size=train_bs, + eval_batch_size=eval_bs, + ) + + def _get_index( + self, + peak_path: str, + annotated: bool, + msg: str = "", + ) -> Union[SpectrumIndex, AnnotatedSpectrumIndex]: + """Get the spectrum index. + + If the file is a SpectrumIndex, only one is allowed. Otherwise multiple + may be specified. + + Parameters + ---------- + peak_path : str + The peak file/directory to check. + annotated : bool + Are the spectra expected to be annotated? + msg : str, optional + A string to insert into the error message. + + Returns + ------- + SpectrumIndex or AnnotatedSpectrumIndex + The spectrum index for training, evaluation, or inference. + """ + ext = (".mgf", ".h5", ".hdf5") + if not annotated: + ext += (".mzml", ".mzxml") + + msg = msg.strip() + filenames = _get_peak_filenames(peak_path, ext) + if not filenames: + not_found_err = f"Cound not find {msg} peak files" + logger.error(not_found_err + " from %s", peak_path) + raise FileNotFoundError(not_found_err) + + is_index = any([Path(f).suffix in (".h5", ".hdf5") for f in filenames]) + if is_index: + if len(filenames) > 1: + h5_err = f"Multiple {msg} HDF5 spectrum indexes specified" + logger.error(h5_err) + raise ValueError(h5_err) + + index_fname, filenames = filenames[0], None + else: + index_fname = Path(self.tmp_dir.name) / f"{uuid.uuid4().hex}.hdf5" + + Index = AnnotatedSpectrumIndex if annotated else SpectrumIndex + valid_charge = np.arange(1, self.config.max_charge + 1) + return Index(index_fname, filenames, valid_charge=valid_charge) + + def _get_strategy(self) -> Optional[DDPStrategy]: + """Get the strategy for the Trainer. + + The DDP strategy works best when multiple GPUs are used. It can work + for CPU-only, but definitely fails using MPS (the Apple Silicon chip) + due to Gloo. + + Returns + ------- + Optional[DDPStrategy] + The strategy parameter for the Trainer. + + """ + if self.config.accelerator in ("cpu", "mps"): + return "auto" + + if self.config.devices == 1: + return "auto" + + if torch.cuda.device_count() > 1: + return DDPStrategy(find_unused_parameters=False, static_graph=True) + + return "auto" def _get_peak_filenames( - path: str, supported_ext: Iterable[str] = (".mgf",) + paths: Iterable[str], supported_ext: Iterable[str] ) -> List[str]: """ Get all matching peak file names from the path pattern. @@ -333,65 +388,22 @@ def _get_peak_filenames( Parameters ---------- - path : str - The path pattern. + paths : Iterable[str] + The path pattern(s). supported_ext : Iterable[str] - Extensions of supported peak file formats. Default: MGF. + Extensions of supported peak file formats. Returns ------- List[str] The peak file names matching the path pattern. """ - path = os.path.expanduser(path) - path = os.path.expandvars(path) - return [ - os.path.abspath(fn) - for fn in glob.glob(path, recursive=True) - if os.path.splitext(fn.lower())[1] in supported_ext - ] - - -def _get_strategy() -> Optional[DDPStrategy]: - """ - Get the strategy for the Trainer. 
- - The DDP strategy works best when multiple GPUs are used. It can work for - CPU-only, but definitely fails using MPS (the Apple Silicon chip) due to - Gloo. - - Returns - ------- - Optional[DDPStrategy] - The strategy parameter for the Trainer. - """ - if torch.cuda.device_count() > 1: - return DDPStrategy(find_unused_parameters=False, static_graph=True) - - return None - - -def _get_devices(no_gpu: bool) -> Union[int, str]: - """ - Get the number of GPUs/CPUs for the Trainer to use. - - Parameters - ---------- - no_gpu : bool - If true, disable all GPU usage. - - Returns - ------- - Union[int, str] - The number of GPUs/CPUs to use, or "auto" to let PyTorch Lightning - determine the appropriate number of devices. - """ - if not no_gpu and any( - operator.attrgetter(device + ".is_available")(torch)() - for device in ("cuda",) - ): - return -1 - elif not (n_workers := utils.n_workers()): - return "auto" - else: - return n_workers + found_files = set() + for path in paths: + path = os.path.expanduser(path) + path = os.path.expandvars(path) + for fname in glob.glob(path, recursive=True): + if Path(fname).suffix.lower() in supported_ext: + found_files.add(fname) + + return sorted(list(found_files)) diff --git a/pyproject.toml b/pyproject.toml index 1b14d1e2..5efb2953 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,13 +27,13 @@ dependencies = [ "pandas", "psutil", "PyGithub", - "pytorch-lightning>=1.7,<2.0", + "lightning>=2.0", "PyYAML", "requests", "scikit-learn", "spectrum_utils", "tensorboard", - "torch>=1.9", + "torch>=2.0", "tqdm", ] dynamic = ["version"] diff --git a/tests/conftest.py b/tests/conftest.py index 574aed8f..c137c5f6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,6 +2,7 @@ import numpy as np import psims import pytest +import yaml from pyteomics.mass import calculate_mass @@ -180,3 +181,25 @@ def _create_mzml(peptides, mzml_file, random_state=42): ) return mzml_file + + +@pytest.fixture +def tiny_config(tmp_path): + """A config file for a tiny model.""" + cfg = { + "n_head": 2, + "dim_feedfoward": 10, + "n_layers": 1, + "warmup_iters": 1, + "max_iters": 1, + "max_epochs": 10, + "every_n_train_steps": 1, + "model_save_folder_path": str(tmp_path), + "accelerator": "cpu", + } + + cfg_file = tmp_path / "config.yml" + with cfg_file.open("w+") as out_file: + yaml.dump(cfg, out_file) + + return cfg_file diff --git a/tests/test_integration.py b/tests/test_integration.py index 7d63f5c1..12002dc2 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1,26 +1,77 @@ +import functools + import pyteomics.mztab +from click.testing import CliRunner from casanovo import casanovo -def test_denovo(mgf_small, mzml_small, tmp_path, monkeypatch): +def test_train_and_run( + mgf_small, mzml_small, tiny_config, tmp_path, monkeypatch +): # We can use this to explicitly test different versions. monkeypatch.setattr(casanovo, "__version__", "3.0.1") - # Predict on small files (MGF and mzML) and verify that the output mzTab - # file exists. 
-    output_filename = tmp_path / "test.mztab"
-    casanovo.main(
-        [
-            "--mode",
-            "denovo",
-            "--peak_path",
-            str(mgf_small).replace(".mgf", ".m*"),
-            "--output",
-            str(output_filename),
-        ],
-        standalone_mode=False,
+    # Run a command:
+    run = functools.partial(
+        CliRunner().invoke, casanovo.main, catch_exceptions=False
     )
+
+    # Train a tiny model:
+    train_args = [
+        "--mode",
+        "train",
+        "--peak_path",
+        mgf_small,
+        "--peak_path_val",
+        mgf_small,
+        "--config",
+        tiny_config,
+        "--output",
+        str(tmp_path / "train"),
+    ]
+
+    result = run(train_args)
+    model_file = tmp_path / "epoch=9-step=10.ckpt"
+    assert result.exit_code == 0
+    assert model_file.exists()
+
+    # Try evaluating:
+    eval_args = [
+        "--mode",
+        "eval",
+        "--peak_path",
+        mgf_small,
+        "--model",
+        model_file,
+        "--config",
+        tiny_config,
+        "--output",
+        str(tmp_path / "eval"),
+    ]
+
+    result = run(eval_args)
+    assert result.exit_code == 0
+
+    # Finally try predicting:
+    output_filename = tmp_path / "test.mztab"
+    predict_args = [
+        "--mode",
+        "denovo",
+        "--peak_path",
+        mgf_small,
+        "--peak_path",
+        mzml_small,
+        "--model",
+        model_file,
+        "--config",
+        tiny_config,
+        "--output",
+        str(output_filename),
+    ]
+
+    result = run(predict_args)
+    assert result.exit_code == 0
     assert output_filename.is_file()
 
     mztab = pyteomics.mztab.MzTab(str(output_filename))
@@ -29,7 +80,8 @@
         assert f"ms_run[{i}]-location" in mztab.metadata
         assert mztab.metadata[f"ms_run[{i}]-location"].endswith(filename)
 
-    # Verify that the spectrum predictions are correct and indexed according to
+    # Verify that the spectrum predictions are correct
+    # and indexed according to
     # the peak input file type.
     psms = mztab.spectrum_match_table
     assert psms.loc[1, "sequence"] == "LESLLEK"
diff --git a/tests/unit_tests/test_config.py b/tests/unit_tests/test_config.py
index 8282e367..8da26f8c 100644
--- a/tests/unit_tests/test_config.py
+++ b/tests/unit_tests/test_config.py
@@ -9,7 +9,7 @@ def test_default():
     config = Config()
     assert config.random_seed == 454
     assert config["random_seed"] == 454
-    assert not config.no_gpu
+    assert config.accelerator == "auto"
     assert config.file == "default"
 
@@ -22,6 +22,6 @@ def test_override(tmp_path):
     config = Config(yml)
     assert config.random_seed == 42
     assert config["random_seed"] == 42
-    assert not config.no_gpu
+    assert config.accelerator == "auto"
     assert config.top_match == 3
     assert config.file == str(yml)
diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py
new file mode 100644
index 00000000..6ef2b250
--- /dev/null
+++ b/tests/unit_tests/test_runner.py
@@ -0,0 +1,94 @@
+"""Unit tests specifically for the model_runner module."""
+from typing import Union, Any, Dict
+
+import lightning.pytorch as pl
+import pytest
+import torch
+from lightning.pytorch.accelerators import Accelerator
+
+from casanovo.config import Config
+from casanovo.denovo.model_runner import ModelRunner
+
+
+def test_initialize_model(tmp_path):
+    """Test initializing a new or existing model."""
+    config = Config()
+    config.train_from_scratch = False
+    ModelRunner(config=config).initialize_model(train=True)
+
+    with pytest.raises(ValueError):
+        ModelRunner(config=config).initialize_model(train=False)
+
+    with pytest.raises(FileNotFoundError):
+        runner = ModelRunner(config=config, model_filename="blah")
+        runner.initialize_model(train=True)
+
+    with pytest.raises(FileNotFoundError):
+        runner = ModelRunner(config=config, model_filename="blah")
+        runner.initialize_model(train=False)
+
+    # This should work now:
+    config.train_from_scratch = True
+    runner = ModelRunner(config=config, model_filename="blah")
+    runner.initialize_model(train=True)
+
+    # But this should still fail:
+    with pytest.raises(FileNotFoundError):
+        runner = ModelRunner(config=config, model_filename="blah")
+        runner.initialize_model(train=False)
+
+    # If the model initialization throws an EOFError, then the Spec2Pep model
+    # has tried to load the weights:
+    weights = tmp_path / "blah"
+    weights.touch()
+    with pytest.raises(EOFError):
+        runner = ModelRunner(config=config, model_filename=str(weights))
+        runner.initialize_model(train=False)
+
+
+def test_save_and_load_weights(tmp_path, mgf_small, tiny_config):
+    """Test saving and loading weights."""
+    config = Config(tiny_config)
+    config.max_epochs = 1
+    config.n_layers = 1
+    ckpt = tmp_path / "test.ckpt"
+
+    with ModelRunner(config=config) as runner:
+        runner.train([mgf_small], [mgf_small])
+        runner.trainer.save_checkpoint(ckpt)
+
+    # Try changing model arch:
+    other_config = Config(tiny_config)
+    other_config.n_layers = 50  # lol
+    with torch.device("meta"):
+        # Now load the weights into a new model
+        # The device should be meta for all the weights.
+        runner = ModelRunner(config=other_config, model_filename=ckpt)
+        runner.initialize_model(train=False)
+
+    obs_layers = runner.model.encoder.transformer_encoder.num_layers
+    assert obs_layers == 1  # Match the original arch.
+    assert next(runner.model.parameters()).device == torch.device("meta")
+
+    # If the Trainer correctly moves the weights to the accelerator,
+    # then it should fail if the weights are on the "meta" device.
+    with torch.device("meta"):
+        with ModelRunner(other_config, model_filename=ckpt) as runner:
+            with pytest.raises(NotImplementedError) as err:
+                runner.evaluate([mgf_small])
+
+    assert "meta tensor; no data!" in str(err.value)
+
+    # Try without arch:
+    ckpt_data = torch.load(ckpt)
+    del ckpt_data["hyper_parameters"]
+    torch.save(ckpt_data, ckpt)
+
+    # Shouldn't work:
+    with ModelRunner(other_config, model_filename=ckpt) as runner:
+        with pytest.raises(RuntimeError):
+            runner.evaluate([mgf_small])
+
+    # Should work:
+    with ModelRunner(config=config, model_filename=ckpt) as runner:
+        runner.evaluate([mgf_small])
diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py
index bc0509bd..8fd0689b 100644
--- a/tests/unit_tests/test_unit.py
+++ b/tests/unit_tests/test_unit.py
@@ -69,6 +69,7 @@ def test_split_version():
     assert version == ("3", "0", "1")
 
 
+@pytest.mark.skip(reason="Hit rate limit during CI/CD")
 def test_get_model_weights(monkeypatch):
     """
     Test that model weights can be downloaded from GitHub or used from the

From ea693600d8d84be90f4e79fe545798bc9e056e83 Mon Sep 17 00:00:00 2001
From: Wout Bittremieux
Date: Fri, 23 Jun 2023 08:11:44 +0200
Subject: [PATCH 08/30] Only create log directory when checkpointing is
 enabled (#196)

Checkpointing during training can be enabled using the `logger` config
option. If this is not specified, or during inference, don't create the
`lightning_logs/` directory.

Fixes #187.
---
 casanovo/config.py              | 1 -
 casanovo/denovo/model_runner.py | 4 +++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/casanovo/config.py b/casanovo/config.py
index fbbf2e16..b32ef07c 100644
--- a/casanovo/config.py
+++ b/casanovo/config.py
@@ -4,7 +4,6 @@
 from typing import Optional, Dict, Callable, Tuple, Union
 
 import yaml
-import torch
 
 from . 
import utils diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 2c22bd62..cac94123 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -171,10 +171,12 @@ def initialize_trainer(self, train: bool) -> None: Determines whether to set the trainer up for model training or evaluation / inference. """ + logger = self.config.logger if self.config.logger is not None else False trainer_cfg = dict( accelerator=self.config.accelerator, devices=1, - logger=self.config.logger, + enable_checkpointing=False, + logger=logger, ) if train: From f74a6ff741b682bcc270c00b38ae9dd3d576d10b Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Tue, 27 Jun 2023 08:23:28 +0200 Subject: [PATCH 09/30] Avoid uninformative warnings (#175) * Avoid detailed fsspec logging * Replace deprecated `auto_select_gpus` * Show nice dataloader workers log message Capture the Pytorch Lightning warning as per https://lightning.ai/docs/pytorch/stable/advanced/speed.html#dataloaders * Overhaul runner * Update linting to only happen once * Fix linting error * Specify utf-8 encoding * Specify utf-8 encoding only for default config * Skip weights tests for now * Update skipping API test * Revert accidental max_epochs change * msg -> reason for pytest.mark.skip * Remove obsolete imports * Fix type hints * Wout's suggestions and more tests * Remove encoding * Ignore irrelevant PyTorch warnings * Update changelog * Remove unused imports * Undo unnecessary whitespace change --------- Co-authored-by: William Fondrie --- CHANGELOG.md | 1 + casanovo/casanovo.py | 16 +++++++++++++--- casanovo/data/ms_io.py | 7 ++++--- casanovo/denovo/model.py | 1 - casanovo/denovo/model_runner.py | 24 ++++++++++-------------- casanovo/utils.py | 8 ++++++++ tests/test_integration.py | 3 +-- tests/unit_tests/test_runner.py | 12 ++++-------- 8 files changed, 41 insertions(+), 31 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1c87c3d8..ac65c309 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - We now log steps rather than epochs as units of progress during training. - Validation performance metrics are logged (and added to tensorboard) at the validation epoch, and training loss is logged at the end of training epoch, i.e. training and validation metrics are logged asynchronously. +- Irrelevant warning messages on the console output and in the log file are no longer shown. ### Fixed diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index f07d7b43..634cf078 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -10,19 +10,28 @@ from typing import Optional, Tuple warnings.filterwarnings("ignore", category=DeprecationWarning) +warnings.filterwarnings( + "ignore", + ".*Consider increasing the value of the `num_workers` argument*", +) +warnings.filterwarnings( + "ignore", + ".*The PyTorch API of nested tensors is in prototype stage*", +) +warnings.filterwarnings( + "ignore", + ".*Converting mask without torch.bool dtype to bool*", +) import appdirs import click import github import requests -import torch import tqdm -import yaml from lightning.pytorch import seed_everything from . import __version__ from . import utils -from .data import ms_io from .denovo import ModelRunner from .config import Config @@ -119,6 +128,7 @@ def main( root.addHandler(file_handler) # Disable dependency non-critical log messages. 
logging.getLogger("depthcharge").setLevel(logging.INFO) + logging.getLogger("fsspec").setLevel(logging.WARNING) logging.getLogger("github").setLevel(logging.WARNING) logging.getLogger("h5py").setLevel(logging.WARNING) logging.getLogger("numba").setLevel(logging.WARNING) diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py index e3b3a8d6..47d99700 100644 --- a/casanovo/data/ms_io.py +++ b/casanovo/data/ms_io.py @@ -5,11 +5,12 @@ import os import re from pathlib import Path -from typing import Any, Dict, List +from typing import List import natsort from .. import __version__ +from ..config import Config class MztabWriter: @@ -42,13 +43,13 @@ def __init__(self, filename: str): self._run_map = {} self.psms = [] - def set_metadata(self, config: Dict[str, Any], **kwargs) -> None: + def set_metadata(self, config: Config, **kwargs) -> None: """ Specify metadata information to write to the mzTab header. Parameters ---------- - config : Dict[str, Any] + config : Config The active configuration options. kwargs Additional configuration options (i.e. from command-line arguments). diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 1105a9e7..516433bc 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1,7 +1,6 @@ """A de novo peptide sequencing model.""" import collections import heapq -import itertools import logging from typing import Any, Dict, Iterable, List, Optional, Tuple, Union diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index cac94123..a77f39b6 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -2,12 +2,11 @@ model.""" import glob import logging -import operator import os import tempfile import uuid from pathlib import Path -from typing import Any, Dict, Iterable, List, Optional, Union +from typing import Iterable, List, Optional, Union import lightning.pytorch as pl import numpy as np @@ -15,7 +14,6 @@ from depthcharge.data import AnnotatedSpectrumIndex, SpectrumIndex from lightning.pytorch.strategies import DDPStrategy -from .. import utils from ..config import Config from ..data import ms_io from ..denovo.dataloaders import DeNovoDataModule @@ -305,7 +303,7 @@ def initialize_data_module( def _get_index( self, - peak_path: str, + peak_path: Iterable[str], annotated: bool, msg: str = "", ) -> Union[SpectrumIndex, AnnotatedSpectrumIndex]: @@ -316,8 +314,8 @@ def _get_index( Parameters ---------- - peak_path : str - The peak file/directory to check. + peak_path : Iterable[str] + The peak files/directories to check. annotated : bool Are the spectra expected to be annotated? msg : str, optional @@ -354,7 +352,7 @@ def _get_index( valid_charge = np.arange(1, self.config.max_charge + 1) return Index(index_fname, filenames, valid_charge=valid_charge) - def _get_strategy(self) -> Optional[DDPStrategy]: + def _get_strategy(self) -> Union[str, DDPStrategy]: """Get the strategy for the Trainer. The DDP strategy works best when multiple GPUs are used. It can work @@ -363,20 +361,18 @@ def _get_strategy(self) -> Optional[DDPStrategy]: Returns ------- - Optional[DDPStrategy] + Union[str, DDPStrategy] The strategy parameter for the Trainer. 
""" if self.config.accelerator in ("cpu", "mps"): return "auto" - - if self.config.devices == 1: + elif self.config.devices == 1: return "auto" - - if torch.cuda.device_count() > 1: + elif torch.cuda.device_count() > 1: return DDPStrategy(find_unused_parameters=False, static_graph=True) - - return "auto" + else: + return "auto" def _get_peak_filenames( diff --git a/casanovo/utils.py b/casanovo/utils.py index cca67747..b497ac12 100644 --- a/casanovo/utils.py +++ b/casanovo/utils.py @@ -1,4 +1,5 @@ """Small utility functions""" +import logging import os import platform import re @@ -8,6 +9,9 @@ import torch +logger = logging.getLogger("casanovo") + + def n_workers() -> int: """ Get the number of workers to use for data loading. @@ -26,6 +30,10 @@ def n_workers() -> int: """ # Windows or MacOS: no multiprocessing. if platform.system() in ["Windows", "Darwin"]: + logger.warning( + "Dataloader multiprocessing is currently not supported on Windows " + "or MacOS; using only a single thread." + ) return 0 # Linux: scale the number of workers by the number of GPUs (if present). try: diff --git a/tests/test_integration.py b/tests/test_integration.py index 12002dc2..1f8f159b 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -81,8 +81,7 @@ def test_train_and_run( assert mztab.metadata[f"ms_run[{i}]-location"].endswith(filename) # Verify that the spectrum predictions are correct - # and indexed according to - # the peak input file type. + # and indexed according to the peak input file type. psms = mztab.spectrum_match_table assert psms.loc[1, "sequence"] == "LESLLEK" assert psms.loc[1, "spectra_ref"] == "ms_run[1]:index=0" diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py index 6ef2b250..9d9f2497 100644 --- a/tests/unit_tests/test_runner.py +++ b/tests/unit_tests/test_runner.py @@ -1,10 +1,6 @@ """Unit tests specifically for the model_runner module.""" -from typing import Union, Any, Dict - -import lightning.pytorch as pl import pytest import torch -from lightning.pytorch.accelerators import Accelerator from casanovo.config import Config from casanovo.denovo.model_runner import ModelRunner @@ -63,7 +59,7 @@ def test_save_and_load_weights(tmp_path, mgf_small, tiny_config): with torch.device("meta"): # Now load the weights into a new model # The device should be meta for all the weights. - runner = ModelRunner(config=other_config, model_filename=ckpt) + runner = ModelRunner(config=other_config, model_filename=str(ckpt)) runner.initialize_model(train=False) obs_layers = runner.model.encoder.transformer_encoder.num_layers @@ -73,7 +69,7 @@ def test_save_and_load_weights(tmp_path, mgf_small, tiny_config): # If the Trainer correctly moves the weights to the accelerator, # then it should fail if the weights are on the "meta" device. 
with torch.device("meta"): - with ModelRunner(other_config, model_filename=ckpt) as runner: + with ModelRunner(other_config, model_filename=str(ckpt)) as runner: with pytest.raises(NotImplementedError) as err: runner.evaluate([mgf_small]) @@ -85,10 +81,10 @@ def test_save_and_load_weights(tmp_path, mgf_small, tiny_config): torch.save(ckpt_data, ckpt) # Shouldn't work: - with ModelRunner(other_config, model_filename=ckpt) as runner: + with ModelRunner(other_config, model_filename=str(ckpt)) as runner: with pytest.raises(RuntimeError): runner.evaluate([mgf_small]) # Should work: - with ModelRunner(config=config, model_filename=ckpt) as runner: + with ModelRunner(config=config, model_filename=str(ckpt)) as runner: runner.evaluate([mgf_small]) From b271dffc07302e9195da178da2f025c1d037296a Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Tue, 27 Jun 2023 10:01:32 +0200 Subject: [PATCH 10/30] Use a single beam by default (#195) * Ensure correct output with n_beams=1 Fixes #185. * Use a single beam by default Fixes #193. * Add unit test for single beam --------- Co-authored-by: melihyilmaz --- casanovo/config.yaml | 2 +- casanovo/denovo/model.py | 7 +++++-- tests/unit_tests/test_unit.py | 31 ++++++++++++++++++++++++++++++- 3 files changed, 36 insertions(+), 4 deletions(-) diff --git a/casanovo/config.yaml b/casanovo/config.yaml index 7b8379ab..dc181c55 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -100,7 +100,7 @@ train_batch_size: 32 # Number of spectra in one inference batch (I) predict_batch_size: 1024 # Number of beams used in beam search (I) -n_beams: 5 +n_beams: 1 # Number of PSMs for each spectrum (I) top_match: 1 # Object for logging training progress diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 516433bc..ccc95756 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -101,7 +101,7 @@ def __init__( precursor_mass_tol: float = 50, isotope_error_range: Tuple[int, int] = (0, 1), min_peptide_len: int = 6, - n_beams: int = 5, + n_beams: int = 1, top_match: int = 1, n_log: int = 10, tb_summarywriter: Optional[ @@ -604,9 +604,12 @@ def _get_topk_beams( # Mask out terminated beams. Include precursor m/z tolerance induced # termination. + # TODO: `clone()` is necessary to get the correct output with n_beams=1. + # An alternative implementation using base PyTorch instead of einops + # might be more efficient. finished_mask = einops.repeat( finished_beams, "(B S) -> B (V S)", S=beam, V=vocab - ) + ).clone() # Mask out the index '0', i.e. padding token, by default. finished_mask[:, :beam] = True diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 8fd0689b..97898ae5 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -341,7 +341,36 @@ def test_beam_search_decode(): ) assert torch.equal(pred_cache[0][0][-1], torch.tensor([4, 14, 4, 13])) - + + # Test _get_topk_beams(). 
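Before the `_get_topk_beams()` check below, here is a minimal sketch of the aliasing behavior behind the `clone()` added in `_get_topk_beams` above. It assumes that, for a pure broadcast such as a single beam, einops defers to `torch.Tensor.expand`, which returns a view instead of a copy:

    import torch

    finished = torch.tensor([False])
    view = finished.expand(4)  # shape (4,), but stride (0,): one shared cell
    print(view.stride())       # (0,) -> all four elements alias finished[0]

    # Cloning materializes the mask, so it is safe to modify in place.
    safe = view.clone()
    safe[0] = True
    print(safe.tolist())       # [True, False, False, False]
    print(finished.tolist())   # [False] -- the original is untouched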
+ step=1 + scores = torch.full( + size=(batch, length, vocab, beam), fill_value=torch.nan + ) + scores = einops.rearrange(scores, "B L V S -> (B S) L V") + tokens = torch.zeros(batch * beam, length, dtype=torch.int64) + tokens[0,0] = 4 + scores[0, step, :] = 0 + scores[0, step, 14] = torch.tensor([1]) + test_finished_beams = torch.tensor([False]) + + new_tokens, new_scores = model._get_topk_beams( + tokens, scores, test_finished_beams, batch, step + ) + + expected_tokens = torch.tensor( + [ + [4, 14], + ] + ) + + expected_scores = torch.zeros(beam, vocab) + expected_scores[:, 14] = torch.tensor([1]) + + assert torch.equal(new_scores[:, step, :], expected_scores) + assert torch.equal(new_tokens[:, : step + 1], expected_tokens) + + # Test _finish_beams() for tokens with a negative mass. model = Spec2Pep(n_beams=2, residues="massivekb") beam = model.n_beams # S From 621ebdc627a8afc8cac6beb202fab2e4ec999ff4 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Tue, 27 Jun 2023 18:16:02 +0200 Subject: [PATCH 11/30] Run linting and tests for the dev branch (#197) * Run linting and tests for dev branch * Fix tests Limit DepthCharge to v0.2. * No DepthCharge v0.3 --- .github/workflows/lint.yml | 8 ++++++-- .github/workflows/tests.yml | 8 ++++++-- pyproject.toml | 2 +- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index ce576f53..fb937494 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -2,9 +2,13 @@ name: Lint on: push: - branches: [ main ] + branches: + - main + - dev pull_request: - branches: [ main ] + branches: + - main + - dev jobs: lint: diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ea5a1eb8..08001ed5 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -5,9 +5,13 @@ name: tests on: push: - branches: [ main ] + branches: + - main + - dev pull_request: - branches: [ main ] + branches: + - main + - dev jobs: build: diff --git a/pyproject.toml b/pyproject.toml index 5efb2953..629c0d29 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ requires-python = ">=3.8" dependencies = [ "appdirs", "click", - "depthcharge-ms>=0.2.0", + "depthcharge-ms>=0.2.0,<0.3.0", "natsort", "numpy", "pandas", From ad48a095f6e7eeee9883f790bfd50b233895102e Mon Sep 17 00:00:00 2001 From: Will Fondrie Date: Tue, 18 Jul 2023 12:07:22 -0700 Subject: [PATCH 12/30] CLI revamp (#184) * Overhaul runner * Update linting to only happen once * Fix linting error * Specify utf-8 encoding * Specify utf-8 encoding only for default config * Skip weights tests for now * Update skipping API test * Revert accidental max_epochs change * msg -> reason for pytest.mark.skip * Wout's suggestions and more tests * Remove encoding * Specify device type when weight loading * Fix lint * Capture init params and figure out device automagically * Add runner tests * Fix bug and limit saved models * WIP * Support old weights too * Remove every_n_train_steps from checkpoint * Implementation done. 
Need to update tests

* Update tests

* add -h

* Add auto screenshot generation

* Add screenshots workflow [screenshots]

* Add manual dispatch

* Added image

* Updated instructions [screenshots]

* Update screenshot workflow

* Fix bug and screenshot uploads

* Restrict workflow

* fix ref

* Generate new screengrabs with rich-codex

* increase timeout

* bump changelog and increase timeout

* Generate new screengrabs with rich-codex

* Add more tests

* Add option for calculating precision during training

* Restrict depthcharge version

* Log scalars instead of tensors

* Fix typo

* Fix issue Wout found

* Write to stdout instead of stderr

* Minor refactoring

* Separate logger and model initialization

* Generate new screengrabs with rich-codex

* Generate new screengrabs with rich-codex

* Fix test formatting

* Fix edge case

---------

Co-authored-by: melihyilmaz
Co-authored-by: github-actions[bot]
Co-authored-by: Wout Bittremieux
---
 .github/workflows/screenshots.yml |  29 +++
 CHANGELOG.md                      |  13 ++
 casanovo/casanovo.py              | 334 +++++++++++++++++++++++-------
 casanovo/config.py                |  13 ++
 casanovo/config.yaml              |   3 +
 casanovo/denovo/evaluate.py       |   2 +-
 casanovo/denovo/model.py          |  64 ++++--
 casanovo/denovo/model_runner.py   |  10 +-
 docs/getting_started.md           |  58 +++---
 docs/images/configure-help.svg    | 107 ++++++++++
 docs/images/evaluate-help.svg     | 166 +++++++++++++++
 docs/images/help.svg              | 200 ++++++++++++++++++
 docs/images/sequence-help.svg     | 166 +++++++++++++++
 docs/images/train-help.svg        | 218 +++++++++++++++++++
 pyproject.toml                    |   3 +-
 tests/conftest.py                 |   2 +-
 tests/test_integration.py         |  49 +++--
 tests/unit_tests/test_runner.py   |  24 +++
 tests/unit_tests/test_unit.py     |  27 ++-
 19 files changed, 1332 insertions(+), 156 deletions(-)
 create mode 100644 .github/workflows/screenshots.yml
 create mode 100644 docs/images/configure-help.svg
 create mode 100644 docs/images/evaluate-help.svg
 create mode 100644 docs/images/help.svg
 create mode 100644 docs/images/sequence-help.svg
 create mode 100644 docs/images/train-help.svg

diff --git a/.github/workflows/screenshots.yml b/.github/workflows/screenshots.yml
new file mode 100644
index 00000000..3b646efa
--- /dev/null
+++ b/.github/workflows/screenshots.yml
@@ -0,0 +1,29 @@
+name: Screenshots with rich-codex
+on:
+  pull_request:
+    paths:
+      - "docs/*.md"
+      - "casanovo/casanovo.py"
+  workflow_dispatch:
+
+jobs:
+  rich_codex:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out the repo
+        uses: actions/checkout@v3
+        with:
+          ref: ${{ github.head_ref }}
+
+      - name: Set up Python
+        uses: actions/setup-python@v3
+
+      - name: Install your custom tools
+        run: pip install .
+
+      - name: Generate terminal images with rich-codex
+        uses: ewels/rich-codex@v1
+        with:
+          timeout: 10
+          commit_changes: "true"
+          clean_img_paths: docs/images/*.svg
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ac65c309..ddf1891d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,8 +6,21 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 
 ## [Unreleased]
 
+### Changed
+
+- The CLI has been overhauled to use subcommands.
+- Upgraded to Lightning >=2.0.
+- Checkpointing is now configured to save the top-k models instead of all.
+
+### Fixed
+
+- Casanovo now runs on CPU and passes all tests.
+
 ### Added
 
+- Checkpoints now include model parameters, allowing for mismatches with the provided configuration file.
+- `accelerator` parameter now controls the accelerator (CPU, GPU, etc.) that is used.
+- `devices` parameter controls the number of accelerators used.
- `every_n_train_steps` parameter now controls the frequency of both validation epochs and model checkpointing during training. ### Changed diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 634cf078..5c2a8bdc 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -7,6 +7,7 @@ import shutil import sys import warnings +from pathlib import Path from typing import Optional, Tuple warnings.filterwarnings("ignore", category=DeprecationWarning) @@ -24,9 +25,12 @@ ) import appdirs -import click +import depthcharge import github +import lightning import requests +import rich_click as click +import torch import tqdm from lightning.pytorch import seed_everything @@ -36,96 +40,268 @@ from .config import Config logger = logging.getLogger("casanovo") +click.rich_click.USE_MARKDOWN = True +click.rich_click.STYLE_HELPTEXT = "" +click.rich_click.SHOW_ARGUMENTS = True -@click.command() -@click.option( - "--mode", +class _SharedParams(click.RichCommand): + """Options shared between most Casanovo commands""" + + def __init__(self, *args, **kwargs) -> None: + """Define shared options.""" + super().__init__(*args, **kwargs) + self.params += [ + click.Option( + ("-m", "--model"), + help=""" + The model weights (.ckpt file). If not provided, Casanovo + will try to download the latest release. + """, + type=click.Path(exists=True, dir_okay=False), + ), + click.Option( + ("-o", "--output"), + help="The mzTab file to which results will be written.", + type=click.Path(dir_okay=False), + ), + click.Option( + ("-c", "--config"), + help=""" + The YAML configuration file overriding the default options. + """, + type=click.Path(exists=True, dir_okay=False), + ), + click.Option( + ("-v", "--verbosity"), + help=""" + Set the verbosity of console logging messages. Log files are + always set to 'debug'. + """, + type=click.Choice( + ["debug", "info", "warning", "error"], + case_sensitive=False, + ), + default="info", + ), + ] + + +@click.group(context_settings=dict(help_option_names=["-h", "--help"])) +def main() -> None: + """# Casanovo + + Casanovo de novo sequences peptides from tandem mass spectra using a + Transformer model. Casanovo currently supports mzML, mzXML, and MGF files + for de novo sequencing and annotated MGF files, such as those from + MassIVE-KB, for training new models. + + Links: + - Documentation: [https://casanovo.readthedocs.io]() + - Official code repository: [https://github.com/Noble-Lab/casanovo]() + + If you use Casanovo in your work, please cite: + - Yilmaz, M., Fondrie, W. E., Bittremieux, W., Oh, S. & Noble, W. S. De novo + mass spectrometry peptide sequencing with a transformer model. Proceedings + of the 39th International Conference on Machine Learning - ICML '22 (2022) + doi:10.1101/2022.02.07.479481. 
+
+    """
+    return
+
+
+@main.command(cls=_SharedParams)
+@click.argument(
+    "peak_path",
     required=True,
-    default="denovo",
-    help="\b\nThe mode in which to run Casanovo:\n"
-    '- "denovo" will predict peptide sequences for\nunknown MS/MS spectra.\n'
-    '- "train" will train a model (from scratch or by\ncontinuing training a '
-    "previously trained model).\n"
-    '- "eval" will evaluate the performance of a\ntrained model using '
-    "previously acquired spectrum\nannotations.",
-    type=click.Choice(["denovo", "train", "eval"]),
+    nargs=-1,
+    type=click.Path(exists=True, dir_okay=False),
 )
-@click.option(
-    "--model",
-    help="The file name of the model weights (.ckpt file).",
+def sequence(
+    peak_path: Tuple[str],
+    model: Optional[str],
+    config: Optional[str],
+    output: Optional[str],
+    verbosity: str,
+) -> None:
+    """De novo sequence peptides from tandem mass spectra.
+
+    PEAK_PATH must be one or more mzML, mzXML, or MGF files from which
+    to sequence peptides.
+    """
+    output = setup_logging(output, verbosity)
+    config = setup_model(model, config, output, False)
+    with ModelRunner(config, model) as runner:
+        logger.info("Sequencing peptides from:")
+        for peak_file in peak_path:
+            logger.info("  %s", peak_file)
+
+        runner.predict(peak_path, output)
+
+    logger.info("DONE!")
+
+
+@main.command(cls=_SharedParams)
+@click.argument(
+    "annotated_peak_path",
+    required=True,
+    nargs=-1,
     type=click.Path(exists=True, dir_okay=False),
 )
-@click.option(
-    "--peak_path",
+def evaluate(
+    annotated_peak_path: Tuple[str],
+    model: Optional[str],
+    config: Optional[str],
+    output: Optional[str],
+    verbosity: str,
+) -> None:
+    """Evaluate de novo peptide sequencing performance.
+
+    ANNOTATED_PEAK_PATH must be one or more annotated MGF files,
+    such as those provided by MassIVE-KB.
+    """
+    output = setup_logging(output, verbosity)
+    config = setup_model(model, config, output, False)
+    with ModelRunner(config, model) as runner:
+        logger.info("Sequencing and evaluating peptides from:")
+        for peak_file in annotated_peak_path:
+            logger.info("  %s", peak_file)
+
+        runner.evaluate(annotated_peak_path)
+
+    logger.info("DONE!")
+
+
+@main.command(cls=_SharedParams)
+@click.argument(
+    "train_peak_path",
     required=True,
-    help="The file path with peak files for predicting peptide sequences or "
-    "training Casanovo.",
-    multiple=True,
+    nargs=-1,
+    type=click.Path(exists=True, dir_okay=False),
 )
 @click.option(
-    "--peak_path_val",
-    help="The file path with peak files to be used as validation data during "
-    "training.",
+    "-p",
+    "--validation_peak_path",
+    help="""
+    An annotated MGF file for validation, like from MassIVE-KB. Use this
+    option multiple times to specify multiple files.
+    """,
+    required=True,
     multiple=True,
-)
-@click.option(
-    "--config",
-    help="The file name of the configuration file with custom options. If not "
-    "specified, a default configuration will be used.",
     type=click.Path(exists=True, dir_okay=False),
 )
+def train(
+    train_peak_path: Tuple[str],
+    validation_peak_path: Tuple[str],
+    model: Optional[str],
+    config: Optional[str],
+    output: Optional[str],
+    verbosity: str,
+) -> None:
+    """Train a Casanovo model on your own data.
+
+    TRAIN_PEAK_PATH must be one or more annotated MGF files, such as those
+    provided by MassIVE-KB, from which to train a new Casanovo model.
+ """ + output = setup_logging(output, verbosity) + config = setup_model(model, config, output, True) + with ModelRunner(config, model) as runner: + logger.info("Training a model from:") + for peak_file in train_peak_path: + logger.info(" %s", peak_file) + + logger.info("Using the following validation files:") + for peak_file in validation_peak_path: + logger.info(" %s", peak_file) + + runner.train(train_peak_path, validation_peak_path) + + logger.info("DONE!") + + +@main.command() +def version() -> None: + """Get the Casanovo version information""" + versions = [ + f"Casanovo: {__version__}", + f"Depthcharge: {depthcharge.__version__}", + f"Lightning: {lightning.__version__}", + f"PyTorch: {torch.__version__}", + ] + sys.stdout.write("\n".join(versions) + "\n") + + +@main.command() @click.option( + "-o", "--output", - help="The base output file name to store logging (extension: .log) and " - "(optionally) prediction results (extension: .mztab).", + help="The output configuration file.", + default="casanovo.yaml", type=click.Path(dir_okay=False), ) -def main( - mode: str, - model: Optional[str], - peak_path: str, - peak_path_val: Optional[str], - config: Optional[str], - output: Optional[str], -): +def configure(output: str) -> None: + """Generate a Casanovo configuration file to customize. + + The casanovo configuration file is in the YAML format. """ - \b - Casanovo: De novo mass spectrometry peptide sequencing with a transformer model. - ================================================================================ + Config.copy_default(output) + output = setup_logging(output, "info") + logger.info(f"Wrote {output}\n") - Yilmaz, M., Fondrie, W. E., Bittremieux, W., Oh, S. & Noble, W. S. De novo - mass spectrometry peptide sequencing with a transformer model. Proceedings - of the 39th International Conference on Machine Learning - ICML '22 (2022) - doi:10.1101/2022.02.07.479481. - Official code website: https://github.com/Noble-Lab/casanovo +def setup_logging( + output: Optional[str], + verbosity: str, +) -> Path: + """Set up the logger. + + Logging occurs to the command-line and to the given log file. + + Parameters + ---------- + output : Optional[str] + The provided output file name. + verbosity : str + The logging level to use in the console. + + Return + ------ + output : Path + The output file path. """ if output is None: - output = os.path.join( - os.getcwd(), - f"casanovo_{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}", - ) - else: - basename, ext = os.path.splitext(os.path.abspath(output)) - output = basename if ext.lower() in (".log", ".mztab") else output + output = f"casanovo_{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}" + + output = Path(output).expanduser().resolve() + + logging_levels = { + "debug": logging.DEBUG, + "info": logging.INFO, + "warning": logging.WARNING, + "error": logging.ERROR, + } # Configure logging. 
logging.captureWarnings(True) root = logging.getLogger() root.setLevel(logging.DEBUG) + + # Formatters for file vs console: + console_formatter = logging.Formatter("{levelname}: {message}", style="{") log_formatter = logging.Formatter( "{asctime} {levelname} [{name}/{processName}] {module}.{funcName} : " "{message}", style="{", ) + console_handler = logging.StreamHandler(sys.stderr) - console_handler.setLevel(logging.DEBUG) - console_handler.setFormatter(log_formatter) + console_handler.setLevel(logging_levels[verbosity.lower()]) + console_handler.setFormatter(console_formatter) root.addHandler(console_handler) - file_handler = logging.FileHandler(f"{output}.log") + file_handler = logging.FileHandler(output.with_suffix(".log")) file_handler.setFormatter(log_formatter) root.addHandler(file_handler) + # Disable dependency non-critical log messages. logging.getLogger("depthcharge").setLevel(logging.INFO) logging.getLogger("fsspec").setLevel(logging.WARNING) @@ -136,13 +312,40 @@ def main( logging.getLogger("torch").setLevel(logging.WARNING) logging.getLogger("urllib3").setLevel(logging.WARNING) + return output + + +def setup_model( + model: Optional[str], + config: Optional[str], + output: Optional[Path], + is_train: bool, +) -> Config: + """Setup Casanovo for most commands. + + Parameters + ---------- + model : Optional[str] + The provided model weights file. + config : Optional[str] + The provided configuration file. + output : Optional[Path] + The provided output file name. + is_train : bool + Are we training? If not, we need to retrieve weights when the model is + None. + + Return + ------ + config : Config + The parsed configuration + """ # Read parameters from the config file. config = Config(config) - seed_everything(seed=config["random_seed"], workers=True) # Download model weights if these were not specified (except when training). - if model is None and mode != "train": + if model is None and not is_train: try: model = _get_model_weights() except github.RateLimitExceededException: @@ -161,26 +364,13 @@ def main( # Log the active configuration. logger.info("Casanovo version %s", str(__version__)) - logger.debug("mode = %s", mode) logger.debug("model = %s", model) - logger.debug("peak_path = %s", peak_path) - logger.debug("peak_path_val = %s", peak_path_val) logger.debug("config = %s", config.file) logger.debug("output = %s", output) for key, value in config.items(): logger.debug("%s = %s", str(key), str(value)) - # Run Casanovo in the specified mode. 
- with ModelRunner(config, model) as model_runner: - if mode == "denovo": - logger.info("Predict peptide sequences with Casanovo.") - model_runner.predict(peak_path, output) - elif mode == "eval": - logger.info("Evaluate a trained Casanovo model.") - model_runner.evaluate(peak_path) - elif mode == "train": - logger.info("Train the Casanovo model.") - model_runner.train(peak_path, peak_path_val) + return config def _get_model_weights() -> str: diff --git a/casanovo/config.py b/casanovo/config.py index b32ef07c..7aec89bc 100644 --- a/casanovo/config.py +++ b/casanovo/config.py @@ -1,5 +1,6 @@ """Parse the YAML configuration.""" import logging +import shutil from pathlib import Path from typing import Optional, Dict, Callable, Tuple, Union @@ -67,6 +68,7 @@ class Config: every_n_train_steps=int, accelerator=str, devices=int, + calculate_precision=bool, ) def __init__(self, config_file: Optional[str] = None): @@ -125,3 +127,14 @@ def validate_param(self, param: str, param_type: Callable): def items(self) -> Tuple[str, ...]: """Return the parameters""" return self._params.items() + + @classmethod + def copy_default(cls, output: str) -> None: + """Copy the default YAML configuration. + + Parameters + ---------- + output : str + The output file. + """ + shutil.copyfile(cls._default_config, output) diff --git a/casanovo/config.yaml b/casanovo/config.yaml index dc181c55..46fa63ea 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -118,6 +118,9 @@ save_top_k: 5 model_save_folder_path: "" # Model validation and checkpointing frequency in training steps every_n_train_steps: 50_000 +# Calculate peptide and amino acid precision during training. this +# is expensive, so we recommend against it. +calculate_precision: False # The hardware accelerator to use. Must be one of: # "cpu", "gpu", "tpu", "ipu", "hpu", "mps", or "auto" accelerator: "auto" diff --git a/casanovo/denovo/evaluate.py b/casanovo/denovo/evaluate.py index 25bb9984..75ac4b6a 100644 --- a/casanovo/denovo/evaluate.py +++ b/casanovo/denovo/evaluate.py @@ -278,7 +278,7 @@ def aa_match_metrics( pep_precision = sum([aa_matches[1] for aa_matches in aa_matches_batch]) / ( len(aa_matches_batch) + 1e-8 ) - return aa_precision, aa_recall, pep_precision + return float(aa_precision), float(aa_recall), float(pep_precision) def aa_precision_recall( diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index ccc95756..1d609ad7 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -82,6 +82,9 @@ class Spec2Pep(pl.LightningModule, ModelMixin): The total number of iterations for the learning rate scheduler. out_writer: Optional[str] The output writer for the prediction results. + calculate_precision: bool + Calculate the validation set precision during training. + This is expensive. **kwargs : Dict Additional keyword arguments passed to the Adam optimizer. """ @@ -110,6 +113,7 @@ def __init__( warmup_iters: int = 100_000, max_iters: int = 600_000, out_writer: Optional[ms_io.MztabWriter] = None, + calculate_precision: bool = False, **kwargs: Dict, ): super().__init__() @@ -157,6 +161,7 @@ def __init__( self.stop_token = self.decoder._aa2idx["$"] # Logging. + self.calculate_precision = calculate_precision self.n_log = n_log self._history = [] if tb_summarywriter is not None: @@ -754,6 +759,8 @@ def validation_step( """ # Record the loss. 
loss = self.training_step(batch, mode="valid") + if not self.calculate_precision: + return loss # Calculate and log amino acid and peptide match evaluation metrics from # the predicted peptides. @@ -761,9 +768,12 @@ def validation_step( for spectrum_preds in self.forward(batch[0], batch[1]): for _, _, pred in spectrum_preds: peptides_pred.append(pred) + aa_precision, _, pep_precision = evaluate.aa_match_metrics( *evaluate.aa_match_batch( - peptides_pred, peptides_true, self.decoder._peptide_mass.masses + peptides_pred, + peptides_true, + self.decoder._peptide_mass.masses, ) ) log_args = dict(on_step=False, on_epoch=True, sync_dist=True) @@ -772,8 +782,11 @@ def validation_step( pep_precision, **log_args, ) - self.log("AA precision at coverage=1", aa_precision, **log_args) - + self.log( + "AA precision at coverage=1", + aa_precision, + **log_args, + ) return loss def predict_step( @@ -828,7 +841,7 @@ def on_train_epoch_end(self) -> None: train_loss = self.trainer.callback_metrics["train_CELoss"].detach() metrics = { "step": self.trainer.global_step, - "train": train_loss, + "train": train_loss.item(), } self._history.append(metrics) self._log_history() @@ -840,14 +853,18 @@ def on_validation_epoch_end(self) -> None: callback_metrics = self.trainer.callback_metrics metrics = { "step": self.trainer.global_step, - "valid": callback_metrics["valid_CELoss"].detach(), - "valid_aa_precision": callback_metrics[ - "AA precision at coverage=1" - ].detach(), - "valid_pep_precision": callback_metrics[ - "Peptide precision at coverage=1" - ].detach(), + "valid": callback_metrics["valid_CELoss"].detach().item(), } + + if self.calculate_precision: + metrics["valid_aa_precision"] = ( + callback_metrics["AA precision at coverage=1"].detach().item() + ) + metrics["valid_pep_precision"] = ( + callback_metrics["Peptide precision at coverage=1"] + .detach() + .item() + ) self._history.append(metrics) self._log_history() @@ -893,19 +910,28 @@ def _log_history(self) -> None: if len(self._history) == 0: return if len(self._history) == 1: - logger.info( - "Step\tTrain loss\tValid loss\tPeptide precision\tAA precision" - ) + header = "Step\tTrain loss\tValid loss\t" + if self.calculate_precision: + header += "Peptide precision\tAA precision" + + logger.info(header) metrics = self._history[-1] if metrics["step"] % self.n_log == 0: - logger.info( - "%i\t%.6f\t%.6f\t%.6f\t%.6f", + msg = "%i\t%.6f\t%.6f" + vals = [ metrics["step"], metrics.get("train", np.nan), metrics.get("valid", np.nan), - metrics.get("valid_pep_precision", np.nan), - metrics.get("valid_aa_precision", np.nan), - ) + ] + + if self.calculate_precision: + msg += "\t%.6f\t%.6f" + vals += [ + metrics.get("valid_pep_precision", np.nan), + metrics.get("valid_aa_precision", np.nan), + ] + + logger.info(msg, *vals) if self.tb_summarywriter is not None: for descr, key in [ ("loss/train_crossentropy_loss", "train"), diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index a77f39b6..c622b345 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -13,6 +13,7 @@ import torch from depthcharge.data import AnnotatedSpectrumIndex, SpectrumIndex from lightning.pytorch.strategies import DDPStrategy +from lightning.pytorch.callbacks import ModelCheckpoint from ..config import Config from ..data import ms_io @@ -54,7 +55,7 @@ def __init__( # Configure checkpoints. 
if config.save_top_k is not None: self.callbacks = [ - pl.callbacks.ModelCheckpoint( + ModelCheckpoint( dirpath=config.model_save_folder_path, monitor="valid_CELoss", mode="min", @@ -143,7 +144,7 @@ def predict(self, peak_path: Iterable[str], output: str) -> None: ------- self """ - self.writer = ms_io.MztabWriter(f"{output}.mztab") + self.writer = ms_io.MztabWriter(Path(output).with_suffix(".mztab")) self.writer.set_metadata( self.config, model=str(self.model_filename), @@ -169,7 +170,9 @@ def initialize_trainer(self, train: bool) -> None: Determines whether to set the trainer up for model training or evaluation / inference. """ - logger = self.config.logger if self.config.logger is not None else False + logger = ( + self.config.logger if self.config.logger is not None else False + ) trainer_cfg = dict( accelerator=self.config.accelerator, devices=1, @@ -227,6 +230,7 @@ def initialize_model(self, train: bool) -> None: lr=self.config.learning_rate, weight_decay=self.config.weight_decay, out_writer=self.writer, + calculate_precision=self.config.calculate_precision, ) from_scratch = ( diff --git a/docs/getting_started.md b/docs/getting_started.md index 2f385e2d..729d2704 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -16,8 +16,8 @@ Once you have conda installed, you can use this helpful [cheat sheet](https://do ### Create a conda environment -Fist, open the terminal (MacOS and Linux) or the Anaconda Prompt (Windows). -All of the commands that follow should be entered this terminal or Anaconda Prompt window---that is, your *shell*. +First, open the terminal (MacOS and Linux) or the Anaconda Prompt (Windows). +All of the commands that follow should be entered into this terminal or Anaconda Prompt window---that is, your *shell*. To create a new conda environment for Casanovo, run the following: ```sh @@ -58,63 +58,69 @@ After installation, test that it was successful by viewing the Casanovo command ```sh casanovo --help ``` +![`casanovo --help`](images/help.svg) -All auxiliary data, model, and training-related parameters can be specified in a user created `.yaml` configuration file. -See [`casanovo/config.yaml`](https://github.com/Noble-Lab/casanovo/blob/main/casanovo/config.yaml) for the default configuration that was used to obtain the reported results. When running Casanovo in eval or denovo mode, you can change some of the parameters in this file, indicated with "(I)" in the file. You should not change other parameters unless you are training a new Casanovo model. + +All auxiliary data, model, and training-related parameters can be specified in a YAML configuration file. +To generate a YAML file containing the current Casanovo defaults, run: +```sh +casanovo configure +``` +![`casanovo configure --help`](images/configure-help.svg) + +When using Casanovo to sequence peptides from mass spectra or evaluate a previous model's performance, you can change some of the parameters in this file, indicated with "(I)" in the file. +The other parameters will not have an effect unless you are training a new Casanovo model. ### Download model weights -When running Casanovo in `denovo` or `eval` mode, Casanovo needs compatible pretrained model weights to make predictions. -Our model weights are uploaded with new Casanovo versions on the [Releases page](https://github.com/Noble-Lab/casanovo/releases) under the "Assets" for each release (file extension: .ckpt). -The model file can then be specified using the `--model` command-line parameter when executing Casanovo. 
-
-To assist users, if no model file is specified Casanovo will try to download and use a compatible model file automatically.
+To sequence peptides from new mass spectra, Casanovo needs compatible pretrained model weights to make its predictions.
+By default, Casanovo will try to download the latest compatible model weights from GitHub when it is run.
 
-Not all releases might have a model file included on the [Releases page](https://github.com/Noble-Lab/casanovo/releases), in which case model weights for alternative releases with the same major version number can be used.
+However, our model weights are uploaded with new Casanovo versions on the [Releases page](https://github.com/Noble-Lab/casanovo/releases) under the "Assets" for each release (file extension: `.ckpt`).
+This model file or a custom one can then be specified using the `--model` command-line parameter when executing Casanovo.
+
+Not all releases will have a model file included on the [Releases page](https://github.com/Noble-Lab/casanovo/releases), in which case model weights for alternative releases with the same major version number can be used.
 
 ## Running Casanovo
 
 ```{note}
 We recommend a Linux system with a dedicated GPU to achieve optimal runtime performance.
-Notably, Casanovo is restricted to single-threaded execution only on Windows and MacOS.
 ```
 
-> **Warning**
-> Casanovo can currently crash if no GPU is available.
-> We are actively trying to fix this known issue.
-
 ### Sequence new mass spectra
 
-To sequence your own mass spectra with Casanovo, use the `denovo` mode:
+To sequence your own mass spectra with Casanovo, use the `casanovo sequence` command:
 
 ```sh
-casanovo --mode=denovo --peak_path=path/to/predict/spectra.mgf --output=path/to/output
+casanovo sequence -o results.mztab spectra.mgf
 ```
+![`casanovo sequence --help`](images/sequence-help.svg)
 
 Casanovo can predict peptide sequences for MS/MS spectra in mzML, mzXML, and MGF files.
 This will write peptide predictions for the given MS/MS spectra to the specified output file in mzTab format.
 
-> **Warning**
-> If you are running inference with Casanovo on a system that has multiple GPUs, it is necessary to restrict Casanovo to (maximum) a single GPU.
-> For example, for CUDA-capable GPUs, GPU visibility can be controlled by setting the `CUDA_VISIBLE_DEVICES` shell variable.
-
 ### Evaluate *de novo* sequencing performance
 
-To evaluate _de novo_ sequencing performance based on known mass spectrum annotations, run:
+To evaluate _de novo_ sequencing performance based on known mass spectrum annotations, use the `casanovo evaluate` command:
 
 ```sh
-casanovo --mode=eval --peak_path=path/to/test/annotated_spectra.mgf
+casanovo evaluate annotated_spectra.mgf
 ```
+![`casanovo evaluate --help`](images/evaluate-help.svg)
 
 
-To evaluate the peptide predictions, ground truth peptide labels must be provided as an annotated MGF file where the peptide sequence is denoted in the `SEQ` field.
+To evaluate the peptide predictions, ground truth peptide labels must be provided as an annotated MGF file where the peptide sequence is denoted in the `SEQ` field.
+Compatible MGF files are available from [MassIVE-KB](https://massive.ucsd.edu/ProteoSAFe/static/massive-kb-libraries.jsp).
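For reference, a minimal sketch of a single annotated spectrum in such an MGF file (the peptide is taken from the test data; the precursor values and peaks are illustrative only):

```
BEGIN IONS
TITLE=example_spectrum_1
PEPMASS=416.2447
CHARGE=2+
SEQ=LESLLEK
147.1128 1.0
260.1968 0.8
END IONS
```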
 ### Train a new model
 
 To train a model from scratch, run:
 
 ```sh
-casanovo --mode=train --peak_path=path/to/train/annotated_spectra.mgf --peak_path_val=path/to/validation/annotated_spectra.mgf
+casanovo train --validation_peak_path validation_spectra.mgf training_spectra.mgf
 ```
+![`casanovo train --help`](images/train-help.svg)
 
 Training and validation MS/MS data need to be provided as annotated MGF files, where the peptide sequence is denoted in the `SEQ` field.
 
@@ -122,7 +128,7 @@ If a training is continued for a previously trained model, specify the starting
 
 ## Try Casanovo on a small example
 
-Here, we demonstrate how to use Casanovo using a small collection of mass spectra in an MGF file (~100 MS/MS spectra).
+Let's use Casanovo to sequence peptides from a small collection of mass spectra in an MGF file (~100 MS/MS spectra).
 The example MGF file is available at [`sample_data/sample_preprocessed_spectra.mgf`](https://github.com/Noble-Lab/casanovo/blob/main/sample_data/sample_preprocessed_spectra.mgf).
 
 To obtain *de novo* sequencing predictions for these spectra:
@@ -131,7 +137,7 @@ To obtain *de novo* sequencing predictions for these spectra:
 3. Ensure your Casanovo conda environment is activated by typing `conda activate casanovo_env`. (If you named your environment differently, type in that name instead.)
 4. Sequence the mass spectra with Casanovo, replacing `[PATH_TO]` with the path to the example MGF file that you downloaded:
 ```sh
-casanovo --mode=denovo --peak_path=[PATH_TO]/sample_preprocessed_spectra.mgf
+casanovo sequence [PATH_TO]/sample_preprocessed_spectra.mgf
 ```
 
 ```{note}
diff --git a/docs/images/configure-help.svg b/docs/images/configure-help.svg
new file mode 100644
index 00000000..fc4e6305
--- /dev/null
+++ b/docs/images/configure-help.svg
@@ -0,0 +1,107 @@
+[SVG terminal screenshot of `casanovo configure --help`; XML markup omitted]
diff --git a/docs/images/evaluate-help.svg b/docs/images/evaluate-help.svg
new file mode 100644
index 00000000..d4832c98
--- /dev/null
+++ b/docs/images/evaluate-help.svg
@@ -0,0 +1,166 @@
+[SVG terminal screenshot of `casanovo evaluate --help`; XML markup omitted]
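These `docs/images/*-help.svg` screenshots are generated by the rich-codex workflow added in this patch; as I understand it, rich-codex re-renders each SVG from the backticked command embedded in the Markdown image alt text, e.g. in `docs/getting_started.md`:

```md
![`casanovo sequence --help`](images/sequence-help.svg)
```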
diff --git a/docs/images/help.svg b/docs/images/help.svg
new file mode 100644
index 00000000..72d06a85
--- /dev/null
+++ b/docs/images/help.svg
@@ -0,0 +1,200 @@
+[SVG terminal screenshot of `casanovo --help`; XML markup omitted]
diff --git a/docs/images/sequence-help.svg b/docs/images/sequence-help.svg
new file mode 100644
index 00000000..01b002fd
--- /dev/null
+++ b/docs/images/sequence-help.svg
@@ -0,0 +1,166 @@
+[SVG terminal screenshot of `casanovo sequence --help`; XML markup omitted]
diff --git a/docs/images/train-help.svg b/docs/images/train-help.svg
new file mode 100644
index 00000000..e70940ed
--- /dev/null
+++ b/docs/images/train-help.svg
@@ -0,0 +1,218 @@
+[SVG terminal screenshot of `casanovo train --help`; XML markup omitted]
Use    +                                                       this option multiple    +                                                       times to specify        +                                                       multiple files.         +[required]             +--model-mFILE                    The model weights       +                                                       (.ckpt file). If not    +                                                       provided, Casanovo      +                                                       will try to download    +                                                       the latest release.     +--output-oFILE                    The mzTab file to       +                                                       which results will be   +                                                       written.                +--config-cFILE                    The YAML configuration  +                                                       file overriding the     +                                                       default options.        +--verbosity-v[debug|info|warning|er  Set the verbosity of    +ror]  console logging         +                                                       messages. Log files     +                                                       are always set to       +                                                       'debug'.                +--help-h  Show this message and   +                                                       exit.                   +╰──────────────────────────────────────────────────────────────────────────────╯ + + + + + diff --git a/pyproject.toml b/pyproject.toml index 629c0d29..7cb38674 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ classifiers = [ requires-python = ">=3.8" dependencies = [ "appdirs", + "lightning>=2.0", "click", "depthcharge-ms>=0.2.0,<0.3.0", "natsort", @@ -27,9 +28,9 @@ dependencies = [ "pandas", "psutil", "PyGithub", - "lightning>=2.0", "PyYAML", "requests", + "rich-click>=1.6.1", "scikit-learn", "spectrum_utils", "tensorboard", diff --git a/tests/conftest.py b/tests/conftest.py index c137c5f6..5eb55979 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -192,7 +192,7 @@ def tiny_config(tmp_path): "n_layers": 1, "warmup_iters": 1, "max_iters": 1, - "max_epochs": 10, + "max_epochs": 20, "every_n_train_steps": 1, "model_save_folder_path": str(tmp_path), "accelerator": "cpu", diff --git a/tests/test_integration.py b/tests/test_integration.py index 1f8f159b..e5d4b285 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1,4 +1,5 @@ import functools +from pathlib import Path import pyteomics.mztab from click.testing import CliRunner @@ -19,35 +20,31 @@ def test_train_and_run( # Train a tiny model: train_args = [ - "--mode", "train", - "--peak_path", - mgf_small, - "--peak_path_val", - mgf_small, + "--validation_peak_path", + str(mgf_small), "--config", tiny_config, "--output", str(tmp_path / "train"), + str(mgf_small), # The training files. 
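+        # Options come before the positional training-file arguments in the
+        # new subcommand-based CLI.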
] result = run(train_args) - model_file = tmp_path / "epoch=9-step=10.ckpt" + model_file = tmp_path / "epoch=19-step=20.ckpt" assert result.exit_code == 0 assert model_file.exists() # Try evaluating: eval_args = [ - "--mode", - "eval", - "--peak_path", - mgf_small, + "evaluate", "--model", - model_file, + str(model_file), "--config", - tiny_config, + str(tiny_config), "--output", str(tmp_path / "eval"), + str(mgf_small), ] result = run(eval_args) @@ -56,18 +53,15 @@ def test_train_and_run( # Finally try predicting: output_filename = tmp_path / "test.mztab" predict_args = [ - "--mode", - "denovo", - "--peak_path", - mgf_small, - "--peak_path", - mzml_small, + "sequence", "--model", - model_file, + str(model_file), "--config", tiny_config, "--output", str(output_filename), + str(mgf_small), + str(mzml_small), ] result = run(predict_args) @@ -91,3 +85,20 @@ def test_train_and_run( assert psms.loc[3, "spectra_ref"] == "ms_run[2]:scan=17" assert psms.loc[4, "sequence"] == "PEPTLDEK" assert psms.loc[4, "spectra_ref"] == "ms_run[2]:scan=111" + + +def test_auxilliary_cli(tmp_path, monkeypatch): + """Test the secondary CLI commands""" + run = functools.partial( + CliRunner().invoke, casanovo.main, catch_exceptions=False + ) + + monkeypatch.chdir(tmp_path) + run("configure") + assert Path("casanovo.yaml").exists() + + run(["configure", "-o", "test.yaml"]) + assert Path("test.yaml").exists() + + res = run("version") + assert res.output diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py index 9d9f2497..663e3b3b 100644 --- a/tests/unit_tests/test_runner.py +++ b/tests/unit_tests/test_runner.py @@ -88,3 +88,27 @@ def test_save_and_load_weights(tmp_path, mgf_small, tiny_config): # Should work: with ModelRunner(config=config, model_filename=str(ckpt)) as runner: runner.evaluate([mgf_small]) + + +def test_calculate_precision(tmp_path, mgf_small, tiny_config): + """Test that this parameter is working correctly.""" + config = Config(tiny_config) + config.n_layers = 1 + config.max_epochs = 1 + config.calculate_precision = False + config.tb_summarywriter = str(tmp_path) + + runner = ModelRunner(config=config) + with runner: + runner.train([mgf_small], [mgf_small]) + + assert "valid_aa_precision" not in runner.model.history.columns + assert "valid_pep_precision" not in runner.model.history.columns + + config.calculate_precision = True + runner = ModelRunner(config=config) + with runner: + runner.train([mgf_small], [mgf_small]) + + assert "valid_aa_precision" in runner.model.history.columns + assert "valid_pep_precision" in runner.model.history.columns diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 97898ae5..efa89a05 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -341,36 +341,35 @@ def test_beam_search_decode(): ) assert torch.equal(pred_cache[0][0][-1], torch.tensor([4, 14, 4, 13])) - + # Test _get_topk_beams(). 
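+    # The test drives a single decoding step: with every candidate score tied
+    # at zero except token 14 (score 1), the top-k selection should extend the
+    # sole live beam from [4] to [4, 14].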
- step=1 + step = 1 scores = torch.full( size=(batch, length, vocab, beam), fill_value=torch.nan - ) + ) scores = einops.rearrange(scores, "B L V S -> (B S) L V") tokens = torch.zeros(batch * beam, length, dtype=torch.int64) - tokens[0,0] = 4 + tokens[0, 0] = 4 scores[0, step, :] = 0 - scores[0, step, 14] = torch.tensor([1]) + scores[0, step, 14] = torch.tensor([1]) test_finished_beams = torch.tensor([False]) - + new_tokens, new_scores = model._get_topk_beams( tokens, scores, test_finished_beams, batch, step ) - + expected_tokens = torch.tensor( [ [4, 14], ] - ) - + ) + expected_scores = torch.zeros(beam, vocab) - expected_scores[:, 14] = torch.tensor([1]) - - assert torch.equal(new_scores[:, step, :], expected_scores) + expected_scores[:, 14] = torch.tensor([1]) + + assert torch.equal(new_scores[:, step, :], expected_scores) assert torch.equal(new_tokens[:, : step + 1], expected_tokens) - - + # Test _finish_beams() for tokens with a negative mass. model = Spec2Pep(n_beams=2, residues="massivekb") beam = model.n_beams # S From 8d2593bfad4b6e48a910501957d04d438c4fc207 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Thu, 3 Aug 2023 10:15:57 +0200 Subject: [PATCH 13/30] Remove Pytorch Lightning logger (#220) * Remove Pytorch Lightning logger Fixes #219. * Update changelog * Clarify tb_summarywriter --- CHANGELOG.md | 18 ++++++++---------- casanovo/config.yaml | 4 +--- casanovo/denovo/model_runner.py | 4 ---- 3 files changed, 9 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ddf1891d..56f1d640 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,16 +6,6 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ## [Unreleased] -### Changed - -- The CLI has been overhauled to use subcommands. -- Upgraded to Lightning >=2.0 -- Checkpointing is now configured to save the top-k models instead of all. - -### Fixed - -- Casanovo now runs on CPU and can passes all tests. - ### Added - Checkpoints now include model parameters, allowing for mismatches with the provided configuration file. @@ -25,12 +15,20 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ### Changed +- The CLI has been overhauled to use subcommands. +- Upgraded to Lightning >=2.0 +- Checkpointing is now configured to save the top-k models instead of all. - We now log steps rather than epochs as units of progress during training. - Validation performance metrics are logged (and added to tensorboard) at the validation epoch, and training loss is logged at the end of training epoch, i.e. training and validation metrics are logged asynchronously. - Irrelevant warning messages on the console output and in the log file are no longer shown. +### Removed + +- Remove config option for a custom Pytorch Lightning logger + ### Fixed +- Casanovo now runs on CPU and can passes all tests. - Upgrade to Depthcharge v0.2.0 to fix sinusoidal encoding. - Correctly refer to input peak files by their full file path. 
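(With this patch, `tb_summarywriter` is a plain directory path rather than a preconfigured object. A minimal sketch of the behavior this configures — the directory name and logged value are illustrative, not taken from the Casanovo source:)

```python
from torch.utils.tensorboard import SummaryWriter

# Event files are written under the given directory and can be inspected
# with `tensorboard --logdir tb_logs`.
writer = SummaryWriter(log_dir="tb_logs")
writer.add_scalar("valid_CELoss", 0.42, global_step=100)
writer.close()
```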
diff --git a/casanovo/config.yaml b/casanovo/config.yaml index 46fa63ea..9700a80f 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -83,7 +83,7 @@ residues: "+43.006-17.027": 25.980265 # Carbamylation and NH3 loss # Logging frequency in training steps n_log: 1 -# Tensorboard object to keep track of training metrics +# Tensorboard directory to use for keeping track of training metrics tb_summarywriter: # Number of warmup iterations for learning rate scheduler warmup_iters: 100_000 @@ -103,8 +103,6 @@ predict_batch_size: 1024 n_beams: 1 # Number of PSMs for each spectrum (I) top_match: 1 -# Object for logging training progress -logger: # Max number of training epochs max_epochs: 30 # Number of validation steps to run before training begins diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index c622b345..ec515519 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -170,14 +170,10 @@ def initialize_trainer(self, train: bool) -> None: Determines whether to set the trainer up for model training or evaluation / inference. """ - logger = ( - self.config.logger if self.config.logger is not None else False - ) trainer_cfg = dict( accelerator=self.config.accelerator, devices=1, enable_checkpointing=False, - logger=logger, ) if train: From e1e1bb075c890d510f36676cc76264b9bf3f93db Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Thu, 3 Aug 2023 10:17:51 +0200 Subject: [PATCH 14/30] Nicely format logged warnings (#223) * Format how warnings are logged Fixes #222. * Update changelog * Fix linting --- CHANGELOG.md | 1 + casanovo/casanovo.py | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 56f1d640..ed050532 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - We now log steps rather than epochs as units of progress during training. - Validation performance metrics are logged (and added to tensorboard) at the validation epoch, and training loss is logged at the end of training epoch, i.e. training and validation metrics are logged asynchronously. - Irrelevant warning messages on the console output and in the log file are no longer shown. +- Nicely format logged warnings. ### Removed diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 5c2a8bdc..992cf566 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -10,6 +10,9 @@ from pathlib import Path from typing import Optional, Tuple +warnings.formatwarning = lambda message, category, *args, **kwargs: ( + f"{category.__name__}: {message}" +) warnings.filterwarnings("ignore", category=DeprecationWarning) warnings.filterwarnings( "ignore", @@ -283,8 +286,9 @@ def setup_logging( # Configure logging. 
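    # `logging.captureWarnings(True)` reroutes `warnings.warn` output through
    # the "py.warnings" logger, so the handlers configured below see warnings
    # in addition to regular log records.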
logging.captureWarnings(True) - root = logging.getLogger() - root.setLevel(logging.DEBUG) + root_logger = logging.getLogger() + root_logger.setLevel(logging.DEBUG) + warnings_logger = logging.getLogger("py.warnings") # Formatters for file vs console: console_formatter = logging.Formatter("{levelname}: {message}", style="{") @@ -297,13 +301,17 @@ def setup_logging( console_handler = logging.StreamHandler(sys.stderr) console_handler.setLevel(logging_levels[verbosity.lower()]) console_handler.setFormatter(console_formatter) - root.addHandler(console_handler) + root_logger.addHandler(console_handler) + warnings_logger.addHandler(console_handler) file_handler = logging.FileHandler(output.with_suffix(".log")) file_handler.setFormatter(log_formatter) - root.addHandler(file_handler) + root_logger.addHandler(file_handler) + warnings_logger.addHandler(file_handler) # Disable dependency non-critical log messages. - logging.getLogger("depthcharge").setLevel(logging.INFO) + logging.getLogger("depthcharge").setLevel( + logging_levels[verbosity.lower()] + ) logging.getLogger("fsspec").setLevel(logging.WARNING) logging.getLogger("github").setLevel(logging.WARNING) logging.getLogger("h5py").setLevel(logging.WARNING) From 3ac0887f9eb79712870ddd8a582949d36fdb779c Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Thu, 3 Aug 2023 10:20:43 +0200 Subject: [PATCH 15/30] Fix validation and checkpointing interval (#224) * Rename every_n_train_steps to val_check_interval * Disable check_val_every_n_epochs * Update changelog --- CHANGELOG.md | 3 ++- casanovo/config.py | 2 +- casanovo/config.yaml | 2 +- casanovo/denovo/model_runner.py | 3 ++- docs/faq.md | 4 ++-- tests/conftest.py | 2 +- 6 files changed, 9 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ed050532..a94acf9c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Checkpoints now include model parameters, allowing for mismatches with the provided configuration file. - `accelerator` parameter now controls the accelerator (CPU, GPU, etc) that is used. - `devices` parameter controls the number of accelerators used. -- `every_n_train_steps` parameter now controls the frequency of both validation epochs and model checkpointing during training. +- `val_check_interval` parameter now controls the frequency of both validation epochs and model checkpointing during training. ### Changed @@ -22,6 +22,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Validation performance metrics are logged (and added to tensorboard) at the validation epoch, and training loss is logged at the end of training epoch, i.e. training and validation metrics are logged asynchronously. - Irrelevant warning messages on the console output and in the log file are no longer shown. - Nicely format logged warnings. +- `every_n_train_steps` has been renamed to `val_check_interval` in accordance to the corresponding Pytorch Lightning parameter. 
### Removed diff --git a/casanovo/config.py b/casanovo/config.py index 7aec89bc..f2d24e3b 100644 --- a/casanovo/config.py +++ b/casanovo/config.py @@ -65,7 +65,7 @@ class Config: train_from_scratch=bool, save_top_k=int, model_save_folder_path=str, - every_n_train_steps=int, + val_check_interval=int, accelerator=str, devices=int, calculate_precision=bool, diff --git a/casanovo/config.yaml b/casanovo/config.yaml index 9700a80f..13e9373d 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -115,7 +115,7 @@ save_top_k: 5 # Path to saved checkpoints model_save_folder_path: "" # Model validation and checkpointing frequency in training steps -every_n_train_steps: 50_000 +val_check_interval: 50_000 # Calculate peptide and amino acid precision during training. this # is expensive, so we recommend against it. calculate_precision: False diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index ec515519..cbefd849 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -189,7 +189,8 @@ def initialize_trainer(self, train: bool) -> None: max_epochs=self.config.max_epochs, num_sanity_val_steps=self.config.num_sanity_val_steps, strategy=self._get_strategy(), - val_check_interval=self.config.every_n_train_steps, + val_check_interval=self.config.val_check_interval, + check_val_every_n_epoch=None, ) trainer_cfg.update(additional_cfg) diff --git a/docs/faq.md b/docs/faq.md index 269d09bf..3dac6858 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -60,8 +60,8 @@ Using the filename (column "filename") you can then retrieve the corresponding p By default, Casanovo saves a snapshot of the model weights after every 50,000 training steps. Note that the number of samples that are processed during a single training step depends on the batch size. -Therefore, when using the default training batch size of 32, this correspond to saving a model snapshot after every 1.6 million training samples. -You can optionally modify the snapshot frequency in the [config file](https://github.com/Noble-Lab/casanovo/blob/main/casanovo/config.yaml) (parameter `every_n_train_steps`), depending on your dataset size. +Therefore, when using the default training batch size of 32, this corresponds to saving a model snapshot after every 1.6 million training samples. +You can optionally modify the snapshot (and validation) frequency in the [config file](https://github.com/Noble-Lab/casanovo/blob/main/casanovo/config.yaml) (parameter `val_check_interval`), depending on your dataset size. Note that taking very frequent model snapshots will result in somewhat slower training time because Casanovo will evaluate its performance on the validation data for every snapshot. When saving a model snapshot, Casanovo will use the validation data to compute performance measures (training loss, validation loss, amino acid precision, and peptide precision) and print this information to the console and log file. 
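(As a quick check of the arithmetic in the FAQ entry above — a standalone sketch using the documented defaults; nothing here is read from an actual config file:)

```python
train_batch_size = 32        # default number of spectra per training step
val_check_interval = 50_000  # default snapshot/validation frequency, in steps

# Each training step consumes one batch, so one snapshot interval covers:
samples_per_snapshot = train_batch_size * val_check_interval
print(f"{samples_per_snapshot:,}")  # 1,600,000 training samples
```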
diff --git a/tests/conftest.py b/tests/conftest.py index 5eb55979..6dcda9c9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -193,7 +193,7 @@ def tiny_config(tmp_path): "warmup_iters": 1, "max_iters": 1, "max_epochs": 20, - "every_n_train_steps": 1, + "val_check_interval": 1, "model_save_folder_path": str(tmp_path), "accelerator": "cpu", } From 514db80ab5fc6963d8a15daf89acd2e4200d9cfa Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Wed, 16 Aug 2023 11:10:32 +0200 Subject: [PATCH 16/30] Fix custom residues in config (#229) * Fix specifying custom residues * Update changelog --- CHANGELOG.md | 1 + casanovo/config.py | 3 ++- tests/unit_tests/test_config.py | 16 +++++++++++++--- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a94acf9c..f600c7b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Casanovo now runs on CPU and can passes all tests. - Upgrade to Depthcharge v0.2.0 to fix sinusoidal encoding. - Correctly refer to input peak files by their full file path. +- Specifying custom residues to retrain Casanovo is now possible. ## [3.3.0] - 2023-04-04 diff --git a/casanovo/config.py b/casanovo/config.py index f2d24e3b..0274a1b1 100644 --- a/casanovo/config.py +++ b/casanovo/config.py @@ -50,6 +50,7 @@ class Config: dropout=float, dim_intensity=int, max_length=int, + residues=dict, n_log=int, tb_summarywriter=str, warmup_iters=int, @@ -66,9 +67,9 @@ class Config: save_top_k=int, model_save_folder_path=str, val_check_interval=int, + calculate_precision=bool, accelerator=str, devices=int, - calculate_precision=bool, ) def __init__(self, config_file: Optional[str] = None): diff --git a/tests/unit_tests/test_config.py b/tests/unit_tests/test_config.py index 8da26f8c..1e2ef338 100644 --- a/tests/unit_tests/test_config.py +++ b/tests/unit_tests/test_config.py @@ -1,6 +1,4 @@ """Test configuration loading""" -import pytest - from casanovo.config import Config @@ -17,11 +15,23 @@ def test_override(tmp_path): """Test overriding the default""" yml = tmp_path / "test.yml" with yml.open("w+") as f_out: - f_out.write("random_seed: 42\ntop_match: 3") + f_out.write( + """random_seed: 42 +top_match: 3 +residues: + W: 1 + O: 2 + U: 3 + T: 4 +""" + ) config = Config(yml) assert config.random_seed == 42 assert config["random_seed"] == 42 assert config.accelerator == "auto" assert config.top_match == 3 + assert len(config.residues) == 4 + for i, residue in enumerate("WOUT", 1): + assert config["residues"][residue] == i assert config.file == str(yml) From bbce3307d1ca9cc75fa49b6fa16e73ee9116985d Mon Sep 17 00:00:00 2001 From: cfmelend Date: Thu, 17 Aug 2023 14:38:53 -0700 Subject: [PATCH 17/30] Added shuffling to trainset dataloaders, no shuffle by default for other dataloaders. --- casanovo/denovo/dataloaders.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 7ab78355..998fa66a 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -126,6 +126,7 @@ def _make_loader( self, dataset: torch.utils.data.Dataset, batch_size: int, + shuffle: bool = False, ) -> torch.utils.data.DataLoader: """ Create a PyTorch DataLoader. @@ -136,6 +137,8 @@ def _make_loader( A PyTorch Dataset. batch_size : int The batch size to use. + shuffle : bool + Option to shuffle the batches. 
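+            Defaults to ``False``; only the training DataLoader enables it.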
Returns ------- @@ -148,11 +151,14 @@ def _make_loader( collate_fn=prepare_batch, pin_memory=True, num_workers=self.n_workers, + shuffle=shuffle, ) def train_dataloader(self) -> torch.utils.data.DataLoader: """Get the training DataLoader.""" - return self._make_loader(self.train_dataset, self.train_batch_size) + return self._make_loader( + self.train_dataset, self.train_batch_size, shuffle=True + ) def val_dataloader(self) -> torch.utils.data.DataLoader: """Get the validation DataLoader.""" From d9396aab3bebd3c178b29af682bf2151298276d4 Mon Sep 17 00:00:00 2001 From: Varun Ananth Date: Fri, 18 Aug 2023 02:43:37 -0700 Subject: [PATCH 18/30] Force gradient calculation during inference (#231) * Add lines to force gradient calculation. During inference only. * Add comments in the code to document the temporary workaround * Update changelog --------- Co-authored-by: Varun Ananth Co-authored-by: Wout Bittremieux Co-authored-by: Wout Bittremieux --- CHANGELOG.md | 11 +++++++++ casanovo/denovo/model.py | 48 ++++++++++++++++++++++------------------ 2 files changed, 37 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f600c7b1..b0e22c71 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,17 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ## [Unreleased] +### Changed + +- The CLI has been overhauled to use subcommands. +- Upgraded to Lightning >=2.0 +- Checkpointing is now configured to save the top-k models instead of all. + +### Fixed + +- Casanovo now runs on CPU and can passes all tests. +- Enable gradients during prediction and validation to avoid NaNs from occuring as a temporary workaround until a new Pytorch version is available. + ### Added - Checkpoints now include model parameters, allowing for mismatches with the provided configuration file. diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 1d609ad7..23eeac92 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -758,7 +758,9 @@ def validation_step( The loss of the validation step. """ # Record the loss. - loss = self.training_step(batch, mode="valid") + # FIXME: Temporary workaround to avoid the NaN bug. + with torch.set_grad_enabled(True): + loss = self.training_step(batch, mode="valid") if not self.calculate_precision: return loss @@ -809,28 +811,30 @@ def predict_step( and amino acid-level confidence scores. """ predictions = [] - for ( - precursor_charge, - precursor_mz, - spectrum_i, - spectrum_preds, - ) in zip( - batch[1][:, 1].cpu().detach().numpy(), - batch[1][:, 2].cpu().detach().numpy(), - batch[2], - self.forward(batch[0], batch[1]), - ): - for peptide_score, aa_scores, peptide in spectrum_preds: - predictions.append( - ( - spectrum_i, - precursor_charge, - precursor_mz, - peptide, - peptide_score, - aa_scores, + # FIXME: Temporary workaround to avoid the NaN bug. 
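+        # Prediction normally runs without autograd; re-enabling gradients
+        # here sidesteps the NaN outputs until the upstream PyTorch fix ships.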
+ with torch.set_grad_enabled(True): + for ( + precursor_charge, + precursor_mz, + spectrum_i, + spectrum_preds, + ) in zip( + batch[1][:, 1].cpu().detach().numpy(), + batch[1][:, 2].cpu().detach().numpy(), + batch[2], + self.forward(batch[0], batch[1]), + ): + for peptide_score, aa_scores, peptide in spectrum_preds: + predictions.append( + ( + spectrum_i, + precursor_charge, + precursor_mz, + peptide, + peptide_score, + aa_scores, + ) ) - ) return predictions From 82be5ae633809b828aeb7025d84033db0c3a0234 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Fri, 18 Aug 2023 11:51:10 +0200 Subject: [PATCH 19/30] Document batch shuffling in the changelog And fix duplicate changelog organization. --- CHANGELOG.md | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b0e22c71..16d9f84e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,38 +6,33 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ## [Unreleased] -### Changed - -- The CLI has been overhauled to use subcommands. -- Upgraded to Lightning >=2.0 -- Checkpointing is now configured to save the top-k models instead of all. - -### Fixed - -- Casanovo now runs on CPU and can passes all tests. -- Enable gradients during prediction and validation to avoid NaNs from occuring as a temporary workaround until a new Pytorch version is available. - ### Added -- Checkpoints now include model parameters, allowing for mismatches with the provided configuration file. -- `accelerator` parameter now controls the accelerator (CPU, GPU, etc) that is used. +- Checkpoints include model parameters, allowing for mismatches with the provided configuration file. +- `accelerator` parameter controls the accelerator (CPU, GPU, etc) that is used. - `devices` parameter controls the number of accelerators used. -- `val_check_interval` parameter now controls the frequency of both validation epochs and model checkpointing during training. +- `val_check_interval` parameter controls the frequency of both validation epochs and model checkpointing during training. ### Changed - The CLI has been overhauled to use subcommands. - Upgraded to Lightning >=2.0 -- Checkpointing is now configured to save the top-k models instead of all. -- We now log steps rather than epochs as units of progress during training. +- Checkpointing is configured to save the top-k models instead of all. +- Log steps rather than epochs as units of progress during training. - Validation performance metrics are logged (and added to tensorboard) at the validation epoch, and training loss is logged at the end of training epoch, i.e. training and validation metrics are logged asynchronously. - Irrelevant warning messages on the console output and in the log file are no longer shown. - Nicely format logged warnings. - `every_n_train_steps` has been renamed to `val_check_interval` in accordance to the corresponding Pytorch Lightning parameter. +- Training batches are randomly shuffled. + +### Fixed + +- Casanovo runs on CPU and can passes all tests. +- Enable gradients during prediction and validation to avoid NaNs from occuring as a temporary workaround until a new Pytorch version is available. ### Removed -- Remove config option for a custom Pytorch Lightning logger +- Remove config option for a custom Pytorch Lightning logger. 
### Fixed From 727ead674be2e6b806923ea2da35705dd4c0be2b Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Mon, 21 Aug 2023 19:22:30 +0200 Subject: [PATCH 20/30] Upgrade depthcharge to v0.2.3 (#235) --- CHANGELOG.md | 1 + pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 16d9f84e..e3d0e272 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Casanovo runs on CPU and can passes all tests. - Enable gradients during prediction and validation to avoid NaNs from occuring as a temporary workaround until a new Pytorch version is available. +- Upgrade to depthcharge v0.2.3 for `PeptideTransformerDecoder` hotfix. ### Removed diff --git a/pyproject.toml b/pyproject.toml index 7cb38674..2b836a41 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ dependencies = [ "appdirs", "lightning>=2.0", "click", - "depthcharge-ms>=0.2.0,<0.3.0", + "depthcharge-ms>=0.2.3,<0.3.0", "natsort", "numpy", "pandas", From 86630e3f1cf26f0746a4c6aabe8cbf582540c43a Mon Sep 17 00:00:00 2001 From: William Stafford Noble Date: Thu, 24 Aug 2023 10:19:02 -0700 Subject: [PATCH 21/30] Edits to config file. (#237) --- casanovo/config.yaml | 132 +++++++++++++++++++++++-------------------- 1 file changed, 71 insertions(+), 61 deletions(-) diff --git a/casanovo/config.yaml b/casanovo/config.yaml index 13e9373d..dedb1740 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -1,16 +1,57 @@ ### # Casanovo configuration. # Blank entries are interpreted as "None". -# Parameters that can be modified when running inference with Casanovo, -# i.e. denovo and eval modes in the command line interface, are marked with -# "(I)". Other parameters shouldn't be changed unless a new Casanovo model -# is being trained. ### -# Random seed to ensure reproducible results. +### +# The following parameters can be modified when running inference or +# when fine-tuning an existing Casanovo model. +### + +# Max absolute difference allowed with respect to observed precursor m/z +# Predictions outside the tolerance range are assigned a negative peptide score. +precursor_mass_tol: 50 # ppm +# Isotopes to consider when comparing predicted and observed precursor m/z's +isotope_error_range: [0, 1] +# The minimum length of predicted peptides +min_peptide_len: 6 +# Number of spectra in one inference batch +predict_batch_size: 1024 +# Number of beams used in beam search +n_beams: 1 +# Number of PSMs for each spectrum +top_match: 1 +# The hardware accelerator to use. Must be one of: +# "cpu", "gpu", "tpu", "ipu", "hpu", "mps", or "auto" +accelerator: "auto" +# The devices to use. Can be set to a positive number int, +# or the value -1 to indicate all available devices should be used, +# If left empty, the appropriate number will be automatically +# selected for automatic selected on the chosen accelerator. +devices: + +### +# The following parameters should only be modified if you are training a new +# Casanovo model from scratch. +### + +# Random seed to ensure reproducible results random_seed: 454 -# Spectrum processing options. +# OUTPUT OPTIONS +# Logging frequency in training steps +n_log: 1 +# Tensorboard directory to use for keeping track of training metrics +tb_summarywriter: +# Save the top k model checkpoints during training. -1 saves all, and +# leaving this field empty saves none. 
+save_top_k: 5 +# Path to saved checkpoints +model_save_folder_path: "" +# Model validation and checkpointing frequency in training steps +val_check_interval: 50_000 + +# SPECTRUM PROCESSING OPTIONS # Number of the most intense peaks to retain, any remaining peaks are discarded n_peaks: 150 # Min peak m/z allowed, peaks with smaller m/z are discarded @@ -23,15 +64,8 @@ min_intensity: 0.01 remove_precursor_tol: 2.0 # Da # Max precursor charge allowed, spectra with larger charge are skipped max_charge: 10 -# Max absolute difference allowed with respect to observed precursor m/z (I) -# Predictions outside the tolerance range are assinged a negative peptide score -precursor_mass_tol: 50 # ppm -# Isotopes to consider when comparing predicted and observed precursor m/z's (I) -isotope_error_range: [0, 1] -# The minimum length of predicted peptides (I). -min_peptide_len: 6 -# Model architecture options. +# MODEL ARCHITECTURE OPTIONS # Dimensionality of latent representations, i.e. peak embeddings dim_model: 512 # Number of attention heads @@ -50,7 +84,29 @@ dim_intensity: custom_encoder: # Max decoded peptide length max_length: 100 -# Amino acid and modification vocabulary to use +# Number of warmup iterations for learning rate scheduler +warmup_iters: 100_000 +# Max number of iterations for learning rate scheduler +max_iters: 600_000 +# Learning rate for weight updates during training +learning_rate: 5e-4 +# Regularization term for weight updates +weight_decay: 1e-5 + +# TRAINING/INFERENCE OPTIONS +# Number of spectra in one training batch +train_batch_size: 32 +# Max number of training epochs +max_epochs: 30 +# Number of validation steps to run before training begins +num_sanity_val_steps: 0 +# Set to "False" to further train a pre-trained Casanovo model +train_from_scratch: True +# Calculate peptide and amino acid precision during training. this +# is expensive, so we recommend against it. +calculate_precision: False + +# AMINO ACID AND MODIFICATION VOCABULARY residues: "G": 57.021464 "A": 71.037114 @@ -81,49 +137,3 @@ residues: "+43.006": 43.005814 # Carbamylation "-17.027": -17.026549 # NH3 loss "+43.006-17.027": 25.980265 # Carbamylation and NH3 loss -# Logging frequency in training steps -n_log: 1 -# Tensorboard directory to use for keeping track of training metrics -tb_summarywriter: -# Number of warmup iterations for learning rate scheduler -warmup_iters: 100_000 -# Max number of iterations for learning rate scheduler -max_iters: 600_000 -# Learning rate for weight updates during training -learning_rate: 5e-4 -# Regularization term for weight updates -weight_decay: 1e-5 - -# Training/inference options. -# Number of spectra in one training batch -train_batch_size: 32 -# Number of spectra in one inference batch (I) -predict_batch_size: 1024 -# Number of beams used in beam search (I) -n_beams: 1 -# Number of PSMs for each spectrum (I) -top_match: 1 -# Max number of training epochs -max_epochs: 30 -# Number of validation steps to run before training begins -num_sanity_val_steps: 0 -# Set to "False" to further train a pre-trained Casanovo model -train_from_scratch: True -# Save the top k model checkpoints during training. -1 saves all and -# leaving this field empty saves none. -save_top_k: 5 -# Path to saved checkpoints -model_save_folder_path: "" -# Model validation and checkpointing frequency in training steps -val_check_interval: 50_000 -# Calculate peptide and amino acid precision during training. this -# is expensive, so we recommend against it. 
-calculate_precision: False -# The hardware accelerator to use. Must be one of: -# "cpu", "gpu", "tpu", "ipu", "hpu", "mps", or "auto" -accelerator: "auto" -# The devices to use. Can be set to a positive number int, -# or the value -1 to indicate all available devices should be used, -# If left empty, the appropriate number will be automatically -# selected for automatic selected on the chosen accelerator. -devices: From 9a4463011a95a9e0f0cfc1cc6a24368d0813f990 Mon Sep 17 00:00:00 2001 From: ishagokhale <72827684+ishagokhale@users.noreply.github.com> Date: Thu, 19 Oct 2023 10:38:39 -0700 Subject: [PATCH 22/30] Remove unused custom_encoder option (#254) * resolves issue #238: remove custom_encoder option * fixed lint issue * fixed lint issue * Revert "fixed lint issue" This reverts commit bd1366c6040bcd7cf9c81edd481b8bb68298f0f1. * lint * lint issue * Consistently format changelog. --------- Co-authored-by: Isha Gokhale Co-authored-by: Wout Bittremieux --- CHANGELOG.md | 12 ++++-------- casanovo/config.yaml | 3 --- casanovo/denovo/model.py | 23 ++++++++--------------- casanovo/denovo/model_runner.py | 1 - 4 files changed, 12 insertions(+), 27 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e3d0e272..73dd7510 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,22 +25,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - `every_n_train_steps` has been renamed to `val_check_interval` in accordance to the corresponding Pytorch Lightning parameter. - Training batches are randomly shuffled. -### Fixed - -- Casanovo runs on CPU and can passes all tests. -- Enable gradients during prediction and validation to avoid NaNs from occuring as a temporary workaround until a new Pytorch version is available. -- Upgrade to depthcharge v0.2.3 for `PeptideTransformerDecoder` hotfix. - ### Removed - Remove config option for a custom Pytorch Lightning logger. +- Remove superfluous `custom_encoder` config option. ### Fixed -- Casanovo now runs on CPU and can passes all tests. -- Upgrade to Depthcharge v0.2.0 to fix sinusoidal encoding. +- Casanovo runs on CPU and can pass all tests. - Correctly refer to input peak files by their full file path. - Specifying custom residues to retrain Casanovo is now possible. +- Upgrade to depthcharge v0.2.3 to fix sinusoidal encoding and for the `PeptideTransformerDecoder` hotfix. +- Enable gradients during prediction and validation to avoid NaNs from occuring as a temporary workaround until a new Pytorch version is available. ## [3.3.0] - 2023-04-04 diff --git a/casanovo/config.yaml b/casanovo/config.yaml index dedb1740..729e827d 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -79,9 +79,6 @@ dropout: 0.0 # Number of dimensions to use for encoding peak intensity # Projected up to ``dim_model`` by default and summed with the peak m/z encoding dim_intensity: -# Option to provide a pre-trained spectrum encoder when training -# Trained from scratch by default -custom_encoder: # Max decoded peptide length max_length: 100 # Number of warmup iterations for learning rate scheduler diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 23eeac92..e3f5655d 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -43,9 +43,6 @@ class Spec2Pep(pl.LightningModule, ModelMixin): (``dim_model - dim_intensity``) are reserved for encoding the m/z value. If ``None``, the intensity will be projected up to ``dim_model`` using a linear layer, then summed with the m/z encoding for each peak. 
- custom_encoder : Optional[Union[SpectrumEncoder, PairedSpectrumEncoder]] - A pretrained encoder to use. The ``dim_model`` of the encoder must be - the same as that specified by the ``dim_model`` parameter here. max_length : int The maximum peptide length to decode. residues: Union[Dict[str, float], str] @@ -97,7 +94,6 @@ def __init__( n_layers: int = 9, dropout: float = 0.0, dim_intensity: Optional[int] = None, - custom_encoder: Optional[SpectrumEncoder] = None, max_length: int = 100, residues: Union[Dict[str, float], str] = "canonical", max_charge: int = 5, @@ -120,17 +116,14 @@ def __init__( self.save_hyperparameters() # Build the model. - if custom_encoder is not None: - self.encoder = custom_encoder - else: - self.encoder = SpectrumEncoder( - dim_model=dim_model, - n_head=n_head, - dim_feedforward=dim_feedforward, - n_layers=n_layers, - dropout=dropout, - dim_intensity=dim_intensity, - ) + self.encoder = SpectrumEncoder( + dim_model=dim_model, + n_head=n_head, + dim_feedforward=dim_feedforward, + n_layers=n_layers, + dropout=dropout, + dim_intensity=dim_intensity, + ) self.decoder = PeptideDecoder( dim_model=dim_model, n_head=n_head, diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index cbefd849..852860d3 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -212,7 +212,6 @@ def initialize_model(self, train: bool) -> None: n_layers=self.config.n_layers, dropout=self.config.dropout, dim_intensity=self.config.dim_intensity, - custom_encoder=self.config.custom_encoder, max_length=self.config.max_length, residues=self.config.residues, max_charge=self.config.max_charge, From 3a18fed2c5c6e89beea204217db4f75d9b2ee5a1 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Tue, 24 Oct 2023 07:53:33 +0200 Subject: [PATCH 23/30] Correctly report AA precision and recall during validation (#253) Fixes #252. Co-authored-by: Melih Yilmaz <32707537+melihyilmaz@users.noreply.github.com> --- CHANGELOG.md | 1 + casanovo/denovo/model.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 73dd7510..628da137 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,6 +37,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Specifying custom residues to retrain Casanovo is now possible. - Upgrade to depthcharge v0.2.3 to fix sinusoidal encoding and for the `PeptideTransformerDecoder` hotfix. - Enable gradients during prediction and validation to avoid NaNs from occuring as a temporary workaround until a new Pytorch version is available. +- Correctly report amino acid precision and recall during validation. 
## [3.3.0] - 2023-04-04 diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index e3f5655d..5851aa76 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -766,8 +766,8 @@ def validation_step( aa_precision, _, pep_precision = evaluate.aa_match_metrics( *evaluate.aa_match_batch( - peptides_pred, peptides_true, + peptides_pred, self.decoder._peptide_mass.masses, ) ) From 7721962e6dbd49de81ddc87175ee492611081af9 Mon Sep 17 00:00:00 2001 From: Melih Yilmaz <32707537+melihyilmaz@users.noreply.github.com> Date: Tue, 24 Oct 2023 01:12:10 -0700 Subject: [PATCH 24/30] Remove gradient calculation during inference (#258) * Remove force_grad in inference * Upgrade required PyTorch version * Update CHANGELOG.md * Update CHANGELOG.md * Fix typo in torch version * Specify correct Pytorch version change --------- Co-authored-by: Wout Bittremieux --- CHANGELOG.md | 4 ++-- casanovo/denovo/model.py | 48 ++++++++++++++++++---------------------- pyproject.toml | 2 +- 3 files changed, 25 insertions(+), 29 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 628da137..08b77443 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,7 +16,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ### Changed - The CLI has been overhauled to use subcommands. -- Upgraded to Lightning >=2.0 +- Upgraded to Lightning >=2.0. - Checkpointing is configured to save the top-k models instead of all. - Log steps rather than epochs as units of progress during training. - Validation performance metrics are logged (and added to tensorboard) at the validation epoch, and training loss is logged at the end of training epoch, i.e. training and validation metrics are logged asynchronously. @@ -24,6 +24,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Nicely format logged warnings. - `every_n_train_steps` has been renamed to `val_check_interval` in accordance to the corresponding Pytorch Lightning parameter. - Training batches are randomly shuffled. +- Upgraded to Torch >=2.1. ### Removed @@ -36,7 +37,6 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Correctly refer to input peak files by their full file path. - Specifying custom residues to retrain Casanovo is now possible. - Upgrade to depthcharge v0.2.3 to fix sinusoidal encoding and for the `PeptideTransformerDecoder` hotfix. -- Enable gradients during prediction and validation to avoid NaNs from occuring as a temporary workaround until a new Pytorch version is available. - Correctly report amino acid precision and recall during validation. ## [3.3.0] - 2023-04-04 diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 5851aa76..26766ead 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -751,9 +751,7 @@ def validation_step( The loss of the validation step. """ # Record the loss. - # FIXME: Temporary workaround to avoid the NaN bug. - with torch.set_grad_enabled(True): - loss = self.training_step(batch, mode="valid") + loss = self.training_step(batch, mode="valid") if not self.calculate_precision: return loss @@ -804,30 +802,28 @@ def predict_step( and amino acid-level confidence scores. """ predictions = [] - # FIXME: Temporary workaround to avoid the NaN bug. 
- with torch.set_grad_enabled(True): - for ( - precursor_charge, - precursor_mz, - spectrum_i, - spectrum_preds, - ) in zip( - batch[1][:, 1].cpu().detach().numpy(), - batch[1][:, 2].cpu().detach().numpy(), - batch[2], - self.forward(batch[0], batch[1]), - ): - for peptide_score, aa_scores, peptide in spectrum_preds: - predictions.append( - ( - spectrum_i, - precursor_charge, - precursor_mz, - peptide, - peptide_score, - aa_scores, - ) + for ( + precursor_charge, + precursor_mz, + spectrum_i, + spectrum_preds, + ) in zip( + batch[1][:, 1].cpu().detach().numpy(), + batch[1][:, 2].cpu().detach().numpy(), + batch[2], + self.forward(batch[0], batch[1]), + ): + for peptide_score, aa_scores, peptide in spectrum_preds: + predictions.append( + ( + spectrum_i, + precursor_charge, + precursor_mz, + peptide, + peptide_score, + aa_scores, ) + ) return predictions diff --git a/pyproject.toml b/pyproject.toml index 2b836a41..551954ab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ dependencies = [ "scikit-learn", "spectrum_utils", "tensorboard", - "torch>=2.0", + "torch>=2.1", "tqdm", ] dynamic = ["version"] From 235420fb2ae77faffc8383a591675cb2989d0cbd Mon Sep 17 00:00:00 2001 From: ishagokhale <72827684+ishagokhale@users.noreply.github.com> Date: Thu, 9 Nov 2023 15:55:59 -0800 Subject: [PATCH 25/30] Issue error for unrecognized/missing config file entry (#257) * added unit tests to raise exceptions when unrecognized/missing file entry * fixed lint issue * fix lint issue * fixed failing unit test * lint issue * lint * lint issue --------- Co-authored-by: Isha Gokhale --- casanovo/config.py | 12 ++++++- tests/conftest.py | 58 ++++++++++++++++++++++++++++++++- tests/unit_tests/test_config.py | 23 +++++++------ 3 files changed, 81 insertions(+), 12 deletions(-) diff --git a/casanovo/config.py b/casanovo/config.py index 0274a1b1..c07073d6 100644 --- a/casanovo/config.py +++ b/casanovo/config.py @@ -83,7 +83,17 @@ def __init__(self, config_file: Optional[str] = None): else: with Path(config_file).open() as f_in: self._user_config = yaml.safe_load(f_in) - + # check for missing entries in config file + if len(self._user_config.keys()) < len(self._params.keys()): + keys_set = set(self._params.keys()) + users_set = set(self._user_config.keys()) + missing = list(keys_set - users_set) + raise KeyError(f"Missing expected entry {missing}") + # detect unrecognized config file entries + keys = list(self._params.keys()) + for key, val in self._user_config.items(): + if key not in keys: + raise KeyError(f"Unrecognized config file entry {key}") # Validate: for key, val in self._config_types.items(): self.validate_param(key, val) diff --git a/tests/conftest.py b/tests/conftest.py index 6dcda9c9..267dfa0f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -188,7 +188,7 @@ def tiny_config(tmp_path): """A config file for a tiny model.""" cfg = { "n_head": 2, - "dim_feedfoward": 10, + "dim_feedforward": 10, "n_layers": 1, "warmup_iters": 1, "max_iters": 1, @@ -196,6 +196,62 @@ def tiny_config(tmp_path): "val_check_interval": 1, "model_save_folder_path": str(tmp_path), "accelerator": "cpu", + "precursor_mass_tol": 5, + "isotope_error_range": [0, 1], + "min_peptide_len": 6, + "predict_batch_size": 1024, + "n_beams": 1, + "top_match": 1, + "devices": None, + "random_seed": 454, + "n_log": 1, + "tb_summarywriter": None, + "save_top_k": 5, + "n_peaks": 150, + "min_mz": 50.0, + "max_mz": 2500.0, + "min_intensity": 0.01, + "remove_precursor_tol": 2.0, + "max_charge": 10, + "dim_model": 512, + "dropout": 
0.0, + "dim_intensity": None, + "max_length": 100, + "learning_rate": 5e-4, + "weight_decay": 1e-5, + "train_batch_size": 32, + "num_sanity_val_steps": 0, + "train_from_scratch": True, + "calculate_precision": False, + "residues": { + "G": 57.021464, + "A": 71.037114, + "S": 87.032028, + "P": 97.052764, + "V": 99.068414, + "T": 101.047670, + "C+57.021": 160.030649, + "L": 113.084064, + "I": 113.084064, + "N": 114.042927, + "D": 115.026943, + "Q": 128.058578, + "K": 128.094963, + "E": 129.042593, + "M": 131.040485, + "H": 137.058912, + "F": 147.068414, + "R": 156.101111, + "Y": 163.063329, + "W": 186.079313, + "M+15.995": 147.035400, + "N+0.984": 115.026943, + "Q+0.984": 129.042594, + "+42.011": 42.010565, + "+43.006": 43.005814, + "-17.027": -17.026549, + "+43.006-17.027": 25.980265, + }, } cfg_file = tmp_path / "config.yml" diff --git a/tests/unit_tests/test_config.py b/tests/unit_tests/test_config.py index 1e2ef338..fd8ed22e 100644 --- a/tests/unit_tests/test_config.py +++ b/tests/unit_tests/test_config.py @@ -1,5 +1,7 @@ """Test configuration loading""" from casanovo.config import Config +import pytest +import yaml def test_default(): @@ -11,7 +13,7 @@ def test_default(): assert config.file == "default" -def test_override(tmp_path): +def test_override(tmp_path, tiny_config): """Test overriding the default""" yml = tmp_path / "test.yml" with yml.open("w+") as f_out: @@ -26,12 +28,13 @@ def test_override(tmp_path): """ ) - config = Config(yml) - assert config.random_seed == 42 - assert config["random_seed"] == 42 - assert config.accelerator == "auto" - assert config.top_match == 3 - assert len(config.residues) == 4 - for i, residue in enumerate("WOUT", 1): - assert config["residues"][residue] == i - assert config.file == str(yml) + with open(tiny_config, "r") as read_file: + contents = yaml.safe_load(read_file) + contents["random_seed_"] = 354 + + with open("output.yml", "w") as write_file: + yaml.safe_dump(contents, write_file) + with pytest.raises(KeyError): + config = Config("output.yml") + with pytest.raises(KeyError): + config = Config(yml) From e073415109b71d205d0f36d0c4c32f4feca3ccea Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Mon, 11 Dec 2023 11:54:07 +0100 Subject: [PATCH 26/30] Check for invalid/missin config entries (#268) * Refactor config checking - More efficient checking for unknown and missing config entries (no needless conversions/creations of sets and lists). - Avoid creating temporary files in root directories during unit testing. - Update outdated test. * Add unit test fix --- casanovo/config.py | 25 ++++++++++--------- tests/unit_tests/test_config.py | 44 ++++++++++++++++----------------- 2 files changed, 35 insertions(+), 34 deletions(-) diff --git a/casanovo/config.py b/casanovo/config.py index c07073d6..c175eef1 100644 --- a/casanovo/config.py +++ b/casanovo/config.py @@ -83,17 +83,20 @@ def __init__(self, config_file: Optional[str] = None): else: with Path(config_file).open() as f_in: self._user_config = yaml.safe_load(f_in) - # check for missing entries in config file - if len(self._user_config.keys()) < len(self._params.keys()): - keys_set = set(self._params.keys()) - users_set = set(self._user_config.keys()) - missing = list(keys_set - users_set) - raise KeyError(f"Missing expected entry {missing}") - # detect unrecognized config file entries - keys = list(self._params.keys()) - for key, val in self._user_config.items(): - if key not in keys: - raise KeyError(f"Unrecognized config file entry {key}") + # Check for missing entries in config file. 
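+        # A set difference over the dict key views reports every missing
+        # option at once instead of failing on the first one.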
+ config_missing = self._params.keys() - self._user_config.keys() + if len(config_missing) > 0: + raise KeyError( + "Missing expected config option(s): " + f"{', '.join(config_missing)}" + ) + # Check for unrecognized config file entries. + config_unknown = self._user_config.keys() - self._params.keys() + if len(config_unknown) > 0: + raise KeyError( + "Unrecognized config option(s): " + f"{', '.join(config_unknown)}" + ) # Validate: for key, val in self._config_types.items(): self.validate_param(key, val) diff --git a/tests/unit_tests/test_config.py b/tests/unit_tests/test_config.py index fd8ed22e..7a0d7a26 100644 --- a/tests/unit_tests/test_config.py +++ b/tests/unit_tests/test_config.py @@ -1,8 +1,9 @@ """Test configuration loading""" -from casanovo.config import Config import pytest import yaml +from casanovo.config import Config + def test_default(): """Test that loading the default works""" @@ -14,27 +15,24 @@ def test_default(): def test_override(tmp_path, tiny_config): - """Test overriding the default""" - yml = tmp_path / "test.yml" - with yml.open("w+") as f_out: - f_out.write( - """random_seed: 42 -top_match: 3 -residues: - W: 1 - O: 2 - U: 3 - T: 4 -""" - ) - - with open(tiny_config, "r") as read_file: - contents = yaml.safe_load(read_file) - contents["random_seed_"] = 354 - - with open("output.yml", "w") as write_file: - yaml.safe_dump(contents, write_file) + # Test expected config option is missing. + filename = str(tmp_path / "config_missing.yml") + with open(tiny_config, "r") as f_in, open(filename, "w") as f_out: + cfg = yaml.safe_load(f_in) + # Remove config option. + del cfg["random_seed"] + yaml.safe_dump(cfg, f_out) + with pytest.raises(KeyError): - config = Config("output.yml") + Config(filename) + + # Test invalid config option is present. + filename = str(tmp_path / "config_invalid.yml") + with open(tiny_config, "r") as f_in, open(filename, "w") as f_out: + cfg = yaml.safe_load(f_in) + # Insert invalid config option. + cfg["random_seed_"] = 354 + yaml.safe_dump(cfg, f_out) + with pytest.raises(KeyError): - config = Config(yml) + Config(filename) From 3b688e8d93d1b8909ef21f82ce6240c7c02c086e Mon Sep 17 00:00:00 2001 From: Melih Yilmaz <32707537+melihyilmaz@users.noreply.github.com> Date: Tue, 12 Dec 2023 10:58:00 +0300 Subject: [PATCH 27/30] Label smoothing in training (#261) * Add option to change learning rate scheduler and made it easier to add a new one. * docs * tests and formatting * Add label smoothing * Modify config file * Minor fix config.yaml * Run black * Lint casanovo.py * Revert "Merge branch 'add_lr_schedule_options' into label-smoothing" This reverts commit 5716c7abc10be1cc8e517f3ca564617f4b39728a, reversing changes made to b044bc64b36d878522c8080547d77b4df9f45955. * Add unit test * Fix config test and add changelog --------- Co-authored-by: Justin Sanders --- CHANGELOG.md | 1 + casanovo/config.py | 1 + casanovo/config.yaml | 2 ++ casanovo/denovo/model.py | 13 +++++++++++-- casanovo/denovo/model_runner.py | 1 + tests/conftest.py | 1 + tests/unit_tests/test_unit.py | 19 ++++++++++++++++--- 7 files changed, 33 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 08b77443..306514ff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - `accelerator` parameter controls the accelerator (CPU, GPU, etc) that is used. - `devices` parameter controls the number of accelerators used. 
diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py
index 26766ead..39d2027a 100644
--- a/casanovo/denovo/model.py
+++ b/casanovo/denovo/model.py
@@ -73,6 +73,8 @@ class Spec2Pep(pl.LightningModule, ModelMixin):
     tb_summarywriter: Optional[str]
         Folder path to record performance metrics during training. If ``None``,
         don't use a ``SummaryWriter``.
+    train_label_smoothing: float
+        Smoothing factor when calculating the training loss.
     warmup_iters: int
         The number of warm up iterations for the learning rate scheduler.
     max_iters: int
@@ -106,6 +108,7 @@ def __init__(
         tb_summarywriter: Optional[
             torch.utils.tensorboard.SummaryWriter
         ] = None,
+        train_label_smoothing: float = 0.01,
         warmup_iters: int = 100_000,
         max_iters: int = 600_000,
         out_writer: Optional[ms_io.MztabWriter] = None,
@@ -134,7 +137,10 @@ def __init__(
             max_charge=max_charge,
         )
         self.softmax = torch.nn.Softmax(2)
-        self.celoss = torch.nn.CrossEntropyLoss(ignore_index=0)
+        self.celoss = torch.nn.CrossEntropyLoss(
+            ignore_index=0, label_smoothing=train_label_smoothing
+        )
+        self.val_celoss = torch.nn.CrossEntropyLoss(ignore_index=0)
         # Optimizer settings.
         self.warmup_iters = warmup_iters
         self.max_iters = max_iters
@@ -723,7 +729,10 @@ def training_step(
         """
         pred, truth = self._forward_step(*batch)
         pred = pred[:, :-1, :].reshape(-1, self.decoder.vocab_size + 1)
-        loss = self.celoss(pred, truth.flatten())
+        if mode == "train":
+            loss = self.celoss(pred, truth.flatten())
+        else:
+            loss = self.val_celoss(pred, truth.flatten())
         self.log(
             f"{mode}_CELoss",
             loss.detach(),
diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py
index 852860d3..b047525c 100644
--- a/casanovo/denovo/model_runner.py
+++ b/casanovo/denovo/model_runner.py
@@ -221,6 +221,7 @@ def initialize_model(self, train: bool) -> None:
             top_match=self.config.top_match,
             n_log=self.config.n_log,
             tb_summarywriter=self.config.tb_summarywriter,
+            train_label_smoothing=self.config.train_label_smoothing,
             warmup_iters=self.config.warmup_iters,
             max_iters=self.config.max_iters,
             lr=self.config.learning_rate,
diff --git a/tests/conftest.py b/tests/conftest.py
index 267dfa0f..a690bd8a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -190,6 +190,7 @@ def tiny_config(tmp_path):
         "n_head": 2,
         "dim_feedforward": 10,
         "n_layers": 1,
+        "train_label_smoothing": 0.01,
         "warmup_iters": 1,
         "max_iters": 1,
         "max_epochs": 20,
diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py
index efa89a05..6b840d20 100644
--- a/tests/unit_tests/test_unit.py
+++ b/tests/unit_tests/test_unit.py
@@ -514,15 +514,28 @@ def test_spectrum_id_mzml(mzml_small, tmp_path):
 
 def test_train_val_step_functions():
     """Test train and validation step functions operating on batches."""
-    model = Spec2Pep(n_beams=1, residues="massivekb", min_peptide_len=4)
+    model = Spec2Pep(
+        n_beams=1,
+        residues="massivekb",
+        min_peptide_len=4,
+        train_label_smoothing=0.1,
+    )
     spectra = torch.zeros(1, 5, 2)
     precursors = torch.tensor([[469.25364, 2.0, 235.63410]])
     peptides = ["PEPK"]
     batch = (spectra, precursors, peptides)
 
+    train_step_loss = model.training_step(batch)
+    val_step_loss = model.validation_step(batch)
+
     # Check if valid loss value returned
-    assert model.training_step(batch) > 0
-    assert model.validation_step(batch) > 0
+    assert train_step_loss > 0
+    assert val_step_loss > 0
+
+    # Check if smoothing is applied in training and not in validation
+    assert model.celoss.label_smoothing == 0.1
+    assert model.val_celoss.label_smoothing == 0
+    assert val_step_loss != train_step_loss
 
 
 def test_run_map(mgf_small):
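
A small note on the synthetic batch in the test above: the precursor rows
appear to follow the (neutral mass, charge, m/z) convention used for the
model's precursor inputs, so the third value follows from the first two:

    # m/z of a 469.25364 Da peptide at charge 2 (proton mass ~1.007276 Da):
    mass, charge = 469.25364, 2.0
    mz = mass / charge + 1.007276  # ~235.63410, the value in the test tensor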
From 2aed9e5355a49087efd58dfd69c40e35fc252c78 Mon Sep 17 00:00:00 2001
From: Melih Yilmaz <32707537+melihyilmaz@users.noreply.github.com>
Date: Tue, 12 Dec 2023 11:31:53 +0300
Subject: [PATCH 28/30] Use config options and auto-downloaded weights (#246)

* Use auto-downloaded weights

* Use config options in model init

* Fix linting

* Fix missing return value

* Override mismatching params with ckpt

* Handle corrupt ckpt

* Add test case for parameter mismatch

* Add Python version to screenshot action

* Generate new screengrabs with rich-codex

* Fix import order

* Minor reformatting

---------

Co-authored-by: William Fondrie
Co-authored-by: github-actions[bot]
Co-authored-by: Wout Bittremieux
---
 .github/workflows/screenshots.yml |  10 +-
 casanovo/casanovo.py              |   8 +-
 casanovo/denovo/model_runner.py   |  56 +++++++--
 docs/images/configure-help.svg    |  69 +++++------
 docs/images/evaluate-help.svg     | 131 ++++++++++-----------
 docs/images/help.svg              | 163 +++++++++++++-------------
 docs/images/sequence-help.svg     | 131 ++++++++++-----------
 docs/images/train-help.svg        | 183 +++++++++++++++-----------
 tests/unit_tests/test_runner.py   |   4 +
 9 files changed, 403 insertions(+), 352 deletions(-)

diff --git a/.github/workflows/screenshots.yml b/.github/workflows/screenshots.yml
index 3b646efa..a9bcf896 100644
--- a/.github/workflows/screenshots.yml
+++ b/.github/workflows/screenshots.yml
@@ -11,15 +11,19 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Check out the repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           ref: ${{ github.head_ref }}
 
       - name: Set up Python
-        uses: actions/setup-python@v3
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
 
       - name: Install your custom tools
-        run: pip install .
+        run: |
+          python -m pip install --upgrade pip
+          pip install .
 
       - name: Generate terminal images with rich-codex
         uses: ewels/rich-codex@v1
diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py
index 992cf566..0a1c3618 100644
--- a/casanovo/casanovo.py
+++ b/casanovo/casanovo.py
@@ -133,7 +133,7 @@ def sequence(
     to sequence peptides.
     """
     output = setup_logging(output, verbosity)
-    config = setup_model(model, config, output, False)
+    config, model = setup_model(model, config, output, False)
     with ModelRunner(config, model) as runner:
         logger.info("Sequencing peptides from:")
         for peak_file in peak_path:
@@ -164,7 +164,7 @@ def evaluate(
     such as those provided by MassIVE-KB.
     """
     output = setup_logging(output, verbosity)
-    config = setup_model(model, config, output, False)
+    config, model = setup_model(model, config, output, False)
     with ModelRunner(config, model) as runner:
         logger.info("Sequencing and evaluating peptides from:")
         for peak_file in annotated_peak_path:
@@ -207,7 +207,7 @@ def train(
     provided by MassIVE-KB, from which to train a new Casnovo model.
     """
     output = setup_logging(output, verbosity)
-    config = setup_model(model, config, output, True)
+    config, model = setup_model(model, config, output, True)
     with ModelRunner(config, model) as runner:
         logger.info("Training a model from:")
         for peak_file in train_peak_path:
@@ -378,7 +378,7 @@ def setup_model(
     for key, value in config.items():
         logger.debug("%s = %s", str(key), str(value))
 
-    return config
+    return config, model
 
 
 def _get_model_weights() -> str:
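
Note that `setup_model` now returns the resolved `(config, model)` pair instead
of only the config, presumably so that callers also receive the weights path
when `_get_model_weights()` substitutes an auto-downloaded release for a
missing `--model` argument; all three subcommands above unpack both values
accordingly.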
diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py
index b047525c..697442d1 100644
--- a/casanovo/denovo/model_runner.py
+++ b/casanovo/denovo/model_runner.py
@@ -5,6 +5,7 @@
 import os
 import tempfile
 import uuid
+import warnings
 from pathlib import Path
 from typing import Iterable, List, Optional, Union
 
@@ -217,6 +218,7 @@ def initialize_model(self, train: bool) -> None:
             max_charge=self.config.max_charge,
             precursor_mass_tol=self.config.precursor_mass_tol,
             isotope_error_range=self.config.isotope_error_range,
+            min_peptide_len=self.config.min_peptide_len,
             n_beams=self.config.n_beams,
             top_match=self.config.top_match,
             n_log=self.config.n_log,
@@ -230,6 +232,24 @@ def initialize_model(self, train: bool) -> None:
             calculate_precision=self.config.calculate_precision,
         )
 
+        # Reconfigurable non-architecture related parameters for a loaded model
+        loaded_model_params = dict(
+            max_length=self.config.max_length,
+            precursor_mass_tol=self.config.precursor_mass_tol,
+            isotope_error_range=self.config.isotope_error_range,
+            n_beams=self.config.n_beams,
+            min_peptide_len=self.config.min_peptide_len,
+            top_match=self.config.top_match,
+            n_log=self.config.n_log,
+            tb_summarywriter=self.config.tb_summarywriter,
+            warmup_iters=self.config.warmup_iters,
+            max_iters=self.config.max_iters,
+            lr=self.config.learning_rate,
+            weight_decay=self.config.weight_decay,
+            out_writer=self.writer,
+            calculate_precision=self.config.calculate_precision,
+        )
+
         from_scratch = (
             self.config.train_from_scratch,
             self.model_filename is None,
@@ -248,20 +268,38 @@ def initialize_model(self, train: bool) -> None:
             )
             raise FileNotFoundError("Could not find the model weights file")
 
-        # First try loading model details from the weithgs file,
-        # otherwise use the provided configuration.
+        # First try loading model details from the weights file, otherwise use
+        # the provided configuration.
         device = torch.empty(1).device  # Use the default device.
         try:
             self.model = Spec2Pep.load_from_checkpoint(
-                self.model_filename,
-                map_location=device,
+                self.model_filename, map_location=device, **loaded_model_params
             )
-        except RuntimeError:
-            self.model = Spec2Pep.load_from_checkpoint(
-                self.model_filename,
-                map_location=device,
-                **model_params,
+
+            architecture_params = set(model_params.keys()) - set(
+                loaded_model_params.keys()
             )
+            for param in architecture_params:
+                if model_params[param] != self.model.hparams[param]:
+                    warnings.warn(
+                        f"Mismatching {param} parameter in "
+                        f"model checkpoint ({self.model.hparams[param]}) "
+                        f"vs config file ({model_params[param]}); "
+                        "using the checkpoint."
+                    )
+        except RuntimeError:
+            # This only doesn't work if the weights are from an older version
+            try:
+                self.model = Spec2Pep.load_from_checkpoint(
+                    self.model_filename,
+                    map_location=device,
+                    **model_params,
+                )
+            except RuntimeError:
+                raise RuntimeError(
+                    "Weights file incompatible with the current version of "
+                    "Casanovo. "
+                )
 
     def initialize_data_module(
         self,
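
Two remarks on the loading logic above. Passing keyword arguments to
`load_from_checkpoint` is standard PyTorch Lightning behavior: hyperparameters
stored via `save_hyperparameters()` can be selectively overridden at load time,
which is how the runtime options in `loaded_model_params` take precedence over
the checkpointed values. Architecture parameters, by contrast, must match the
stored weights, so they are only warned about, never overridden. A toy version
of that comparison (all values invented for illustration):

    import warnings

    model_params = {"n_layers": 9, "n_beams": 5}        # from the config file
    loaded_model_params = {"n_beams": 5}                # safe to override
    checkpoint_hparams = {"n_layers": 6, "n_beams": 1}  # stored in the .ckpt

    for param in model_params.keys() - loaded_model_params.keys():
        if model_params[param] != checkpoint_hparams[param]:
            warnings.warn(
                f"Mismatching {param} parameter in model checkpoint "
                f"({checkpoint_hparams[param]}) vs config file "
                f"({model_params[param]}); using the checkpoint."
            )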
diff --git a/docs/images/configure-help.svg b/docs/images/configure-help.svg
index fc4e6305..d5dd7aa8 100644
 [Regenerated rich-codex terminal screenshot of `casanovo configure --help`;
  auto-generated SVG markup diff omitted.]
diff --git a/docs/images/evaluate-help.svg b/docs/images/evaluate-help.svg
index d4832c98..e220664b 100644
 [Regenerated rich-codex terminal screenshot of `casanovo evaluate --help`;
  auto-generated SVG markup diff omitted.]
diff --git a/docs/images/help.svg b/docs/images/help.svg
index 72d06a85..42180a3f 100644
 [Regenerated rich-codex terminal screenshot of `casanovo --help`;
  auto-generated SVG markup diff omitted.]
diff --git a/docs/images/sequence-help.svg b/docs/images/sequence-help.svg
index 01b002fd..d493e2b2 100644
 [Regenerated rich-codex terminal screenshot of `casanovo sequence --help`;
  auto-generated SVG markup diff omitted.]
diff --git a/docs/images/train-help.svg b/docs/images/train-help.svg
index e70940ed..82c30122 100644
 [Regenerated rich-codex terminal screenshot of `casanovo train --help`;
  auto-generated SVG markup diff omitted.]
diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py
index 663e3b3b..6be91831 100644
--- a/tests/unit_tests/test_runner.py
+++ b/tests/unit_tests/test_runner.py
@@ -56,6 +56,8 @@ def test_save_and_load_weights(tmp_path, mgf_small, tiny_config):
     # Try changing model arch:
     other_config = Config(tiny_config)
     other_config.n_layers = 50  # lol
+    other_config.n_beams = 12
+    other_config.max_iters = 2
     with torch.device("meta"):
         # Now load the weights into a new model
         # The device should be meta for all the weights.
@@ -64,6 +66,8 @@ def test_save_and_load_weights(tmp_path, mgf_small, tiny_config):
 
     obs_layers = runner.model.encoder.transformer_encoder.num_layers
     assert obs_layers == 1  # Match the original arch.
+    assert runner.model.n_beams == 12  # Match the config
+    assert runner.model.max_iters == 2  # Match the config
     assert next(runner.model.parameters()).device == torch.device("meta")
 
     # If the Trainer correctly moves the weights to the accelerator,

From ad9ba0f710abf61b0d7c181efea0bf1e66d6910b Mon Sep 17 00:00:00 2001
From: Melih Yilmaz <32707537+melihyilmaz@users.noreply.github.com>
Date: Wed, 13 Dec 2023 11:08:06 +0300
Subject: [PATCH 29/30] Minor fix to label smoothing (#270)

Add option to change label smoothing for model initialization from a
checkpoint file.
---
 casanovo/denovo/model_runner.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py
index 697442d1..c7a9cab6 100644
--- a/casanovo/denovo/model_runner.py
+++ b/casanovo/denovo/model_runner.py
@@ -242,6 +242,7 @@ def initialize_model(self, train: bool) -> None:
             top_match=self.config.top_match,
             n_log=self.config.n_log,
             tb_summarywriter=self.config.tb_summarywriter,
+            train_label_smoothing=self.config.train_label_smoothing,
             warmup_iters=self.config.warmup_iters,
             max_iters=self.config.max_iters,
             lr=self.config.learning_rate,

From c862bb5b42bb2ff93452c3cdcf418f7f8f509c04 Mon Sep 17 00:00:00 2001
From: Wout Bittremieux
Date: Fri, 22 Dec 2023 13:44:08 +0100
Subject: [PATCH 30/30] Update version number in changelog

---
 CHANGELOG.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e5a74b68..bbc9284e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 
 ## [Unreleased]
 
+## [4.0.0] - 2023-12-22
+
 ### Added
 
 - Checkpoints include model parameters, allowing for mismatches with the provided configuration file.
@@ -215,7 +217,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 
 - Initial Casanovo version.
 
-[Unreleased]: https://github.com/Noble-Lab/casanovo/compare/v3.5.0...HEAD
+[Unreleased]: https://github.com/Noble-Lab/casanovo/compare/v4.0.0...HEAD
+[4.0.0]: https://github.com/Noble-Lab/casanovo/compare/v3.5.0...v4.0.0
 [3.5.0]: https://github.com/Noble-Lab/casanovo/compare/v3.4.0...v3.5.0
 [3.4.0]: https://github.com/Noble-Lab/casanovo/compare/v3.3.0...v3.4.0
 [3.3.0]: https://github.com/Noble-Lab/casanovo/compare/v3.2.0...v3.3.0