diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index fb937494..2d7fa400 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -14,9 +14,9 @@ jobs: lint: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Setup Python 3.10 - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: "3.10" diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index dea50c18..e64608c5 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -12,11 +12,11 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: - python-version: '3.x' + python-version: "3.x" - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/.github/workflows/screenshots.yml b/.github/workflows/screenshots.yml index a9bcf896..7fb04c0d 100644 --- a/.github/workflows/screenshots.yml +++ b/.github/workflows/screenshots.yml @@ -16,7 +16,7 @@ jobs: ref: ${{ github.head_ref }} - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: "3.10" diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 08001ed5..53483060 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -21,9 +21,9 @@ jobs: os: [ubuntu-latest, windows-latest, macos-latest] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python 3.10 - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: "3.10" diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 0a1c3618..8bdfa58f 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -1,4 +1,5 @@ """The command line entry point for Casanovo.""" + import datetime import functools import logging diff --git a/casanovo/config.py 
b/casanovo/config.py index 22924018..817766ac 100644 --- a/casanovo/config.py +++ b/casanovo/config.py @@ -1,4 +1,5 @@ """Parse the YAML configuration.""" + import logging import shutil from pathlib import Path diff --git a/casanovo/data/datasets.py b/casanovo/data/datasets.py index 23b3d8e3..6244e88f 100644 --- a/casanovo/data/datasets.py +++ b/casanovo/data/datasets.py @@ -1,4 +1,5 @@ """A PyTorch Dataset class for annotated spectra.""" + from typing import Optional, Tuple import depthcharge diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py index 47d99700..7be6ea8c 100644 --- a/casanovo/data/ms_io.py +++ b/casanovo/data/ms_io.py @@ -1,4 +1,5 @@ """Mass spectrometry file type input/output operations.""" + import collections import csv import operator diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 998fa66a..fe5d6237 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -1,4 +1,5 @@ """Data loaders for the de novo sequencing task.""" + import functools import os from typing import List, Optional, Tuple diff --git a/casanovo/denovo/evaluate.py b/casanovo/denovo/evaluate.py index 75ac4b6a..cbf9e74f 100644 --- a/casanovo/denovo/evaluate.py +++ b/casanovo/denovo/evaluate.py @@ -1,4 +1,5 @@ """Methods to evaluate peptide-spectrum predictions.""" + import re from typing import Dict, Iterable, List, Tuple diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 39d2027a..b1d51e9c 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1,4 +1,5 @@ """A de novo peptide sequencing model.""" + import collections import heapq import logging diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index b632227b..3253419a 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -1,5 +1,6 @@ """Training and testing functionality for the de novo peptide sequencing model.""" + import glob import logging import os @@ -306,9 
+307,9 @@ def initialize_data_module( self, train_index: Optional[AnnotatedSpectrumIndex] = None, valid_index: Optional[AnnotatedSpectrumIndex] = None, - test_index: ( - Optional[Union[AnnotatedSpectrumIndex, SpectrumIndex]] - ) = None, + test_index: Optional[ + Union[AnnotatedSpectrumIndex, SpectrumIndex] + ] = None, ) -> None: """Initialize the data module diff --git a/casanovo/utils.py b/casanovo/utils.py index b497ac12..4125cd54 100644 --- a/casanovo/utils.py +++ b/casanovo/utils.py @@ -1,4 +1,5 @@ -"""Small utility functions""" +"""Small utility functions.""" + import logging import os import platform diff --git a/casanovo/version.py b/casanovo/version.py index d1b7f64e..579db300 100644 --- a/casanovo/version.py +++ b/casanovo/version.py @@ -1,4 +1,5 @@ """Package version information.""" + from typing import Optional diff --git a/docs/faq.md b/docs/faq.md index 15103cac..a3103601 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -1,5 +1,7 @@ # Frequently Asked Questions +## Running Casanovo + **I installed Casanovo and it worked before, but I after reopening Anaconda it says that Casanovo is not installed.** Make sure you are in the `casanovo_env` environment. You can ensure this by typing: @@ -27,6 +29,8 @@ However, the GitHub API is limited to maximum 60 requests per hour per IP addres Consequently, if Casanovo has been executed multiple times already, it might temporarily not be able to communicate with GitHub. You can avoid this error by explicitly specifying the model file using the `--model` parameter. +## GPU Troubleshooting + **Casanovo is very slow even when running on the GPU. How can I speed it up?** It is highly recommended to run Casanovo on the GPU to get the maximum performance. 
@@ -52,6 +56,22 @@ This means that there was not enough (free) memory available on your GPU to run We recommend trying to decrease the `train_batch_size` or `predict_batch_size` options in the [config file](https://github.com/Noble-Lab/casanovo/blob/main/casanovo/config.yaml) (depending on whether the error occurred during `train` or `denovo` mode) to reduce the number of spectra that are processed simultaneously. Additionally, we recommend shutting down any other processes that may be running on the GPU, so that Casanovo can exclusively use the GPU. +**How can I run Casanovo on a specific GPU device?** + +You can control which GPU(s) Casanovo uses by setting the `devices` option in the [configuration file](https://github.com/Noble-Lab/casanovo/blob/main/casanovo/config.yaml). +Analogously, this setting also controls the number of cores to use when running on a CPU only (which can be specified using the `accelerator` option). + +By default, Casanovo will automatically try to use the maximum number of devices available. +That is, if your system has multiple GPUs, Casanovo will use all of them for maximum efficiency. +Alternatively, you can select a specific GPU by specifying the GPU number as the value for `devices`. +For example, if you have a four-GPU system and specify `devices: 1` in your config file, Casanovo will only use the GPU with identifier `1`. + +The config file functionality only allows specifying a single GPU, by setting its id under `devices`, or all GPUs, by setting `devices: -1`. +If you want more fine-grained control to use some but not all GPUs on a multi-GPU system, the `CUDA_VISIBLE_DEVICES` environment variable can be used instead. +For example, by setting `CUDA_VISIBLE_DEVICES=1,3`, only GPUs `1` and `3` will be visible to Casanovo, and specifying `devices: -1` will allow it to utilize both of these. 
+ +Note that when using `CUDA_VISIBLE_DEVICES`, the visible GPUs are renumbered consecutively starting from `0`, so any GPU number specified under `devices` must refer to this new numbering. + **I see "NotImplementedError: The operator 'aten::index.Tensor'..." when using a Mac with an Apple Silicon chip.** Casanovo can leverage Apple's Metal Performance Shaders (MPS) on newer Mac computers, which requires that the `PYTORCH_ENABLE_MPS_FALLBACK` is set to `1`: @@ -62,9 +82,11 @@ export PYTORCH_ENABLE_MPS_FALLBACK=1 This will need to be set with each new shell session, or you can add it to your `.bashrc` / `.zshrc` to set this environment variable by default. +## Training Casanovo + **Where can I find the data that Casanovo was trained on?** -The [Casanovo results reported ](https://doi.org/10.1101/2023.01.03.522621) were obtained by training on two different datasets: (i) a commonly used nine-species benchmark dataset, and (ii) a large-scale training dataset derived from the MassIVE Knowledge Base (MassIVE-KB). +The [Casanovo results reported](https://doi.org/10.1101/2023.01.03.522621) were obtained by training on two different datasets: (i) a commonly used nine-species benchmark dataset, and (ii) a large-scale training dataset derived from the MassIVE Knowledge Base (MassIVE-KB). All data for the _nine-species benchmark_ is available as annotated MGF files [on MassIVE](https://doi.org/doi:10.25345/C52V2CK8J). Using these data, Casanovo was trained in a cross-validated fashion, training on eight species and testing on the remaining species. @@ -75,6 +97,9 @@ To compile this dataset yourself, on the [MassIVE website](https://massive.ucsd. This will give you a zipped TSV file with the metadata and peptide identifications for all 30 million PSMs. Using the filename (column "filename") you can then retrieve the corresponding peak files from the MassIVE FTP server and extract the desired spectra using their scan number (column "scan"). 
+The _non-enzymatic dataset_, used to train a non-tryptic version of Casanovo, was created by selecting PSMs with a uniform distribution of amino acids at the C-terminal peptide positions from two datasets: MassIVE-KB and PROSPECT. +Training, validation, and test splits for the non-enzymatic dataset are available as annotated MGF files [on MassIVE](https://doi.org/doi:10.25345/C5KS6JG0W). + **How do I know which model to use after training Casanovo?** By default, Casanovo saves a snapshot of the model weights after every 50,000 training steps. @@ -107,6 +132,19 @@ To include new PTMs in Casanovo, you need to: It is unfortunately not possible to finetune a pre-trained Casanovo model to add new types of PTMs. Instead, such a model must be trained from scratch. +**How can I change the learning rate schedule used during training?** + +By default, Casanovo uses a learning rate schedule that combines a linear warm-up with a cosine-shaped decay (as implemented in `CosineWarmupScheduler` in `casanovo/denovo/model.py`) during training. +To use a different learning rate schedule, you can specify an alternative learning rate scheduler as follows (in the `lr_scheduler` variable in function `Spec2Pep.configure_optimizers` in `casanovo/denovo/model.py`): + +``` +lr_scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, total_iters=self.warmup_iters) +``` + +You can use any of the scheduler classes available in [`torch.optim.lr_scheduler`](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate) or implement your own custom learning rate schedule, similar to `CosineWarmupScheduler`. + +## Miscellaneous + **How can I generate a precision–coverage curve?** You can evaluate a trained Casanovo model compared to ground-truth peptide labels using a precision–coverage curve. 
diff --git a/tests/conftest.py b/tests/conftest.py index f1918300..3345824e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,5 @@ """Fixtures used for testing.""" + import numpy as np import psims import pytest diff --git a/tests/unit_tests/test_config.py b/tests/unit_tests/test_config.py index 7a0d7a26..89d32569 100644 --- a/tests/unit_tests/test_config.py +++ b/tests/unit_tests/test_config.py @@ -1,4 +1,5 @@ -"""Test configuration loading""" +"""Test configuration loading.""" + import pytest import yaml diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py index a670acad..efaceb6b 100644 --- a/tests/unit_tests/test_runner.py +++ b/tests/unit_tests/test_runner.py @@ -1,4 +1,5 @@ """Unit tests specifically for the model_runner module.""" + import pytest import torch