From 258edb40368fb94f0aeb9a67bb7590b5c8fc31cb Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Wed, 27 Mar 2024 16:44:25 -0700 Subject: [PATCH 01/84] begin adding tests for annotate mode --- casanovo/casanovo.py | 46 +++++++++++++++++ casanovo/data/annotate_db.py | 9 ++++ tests/conftest.py | 97 ++++++++++++++++++++++++++++++++++++ tests/test_integration.py | 5 ++ 4 files changed, 157 insertions(+) create mode 100644 casanovo/data/annotate_db.py diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 8bdfa58f..dcbecea3 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -42,6 +42,7 @@ from . import utils from .denovo import ModelRunner from .config import Config +from .data.annotate_db import annotate_mgf logger = logging.getLogger("casanovo") click.rich_click.USE_MARKDOWN = True @@ -145,6 +146,51 @@ def sequence( logger.info("DONE!") +@main.command(cls=_SharedParams) +@click.argument( + "peak_path", + required=True, + nargs=1, + type=click.Path(exists=True, dir_okay=False), +) +@click.argument( + "tide_path", + required=True, + nargs=1, + type=click.Path(exists=True, dir_okay=True), +) +def annotate( + peak_path: str, + tide_path: str, + model: Optional[str], + config: Optional[str], + output: Optional[str], + verbosity: str, +) -> None: + """Annotate a given .mgf with candidates as selected by a Tide search for Casanovo-DB. + + PEAK_PATH must be one MGF file from which to annotate spectra. + + TIDE_PATH must be one directory containing the Tide search results of the .mgf. + This directory must contain tide-search.decoy.txt and tide-search.target.txt + """ + for peak_file in peak_path: + logger.info(" %s", peak_file) + + if output is None: + output = setup_logging(output, verbosity) + logger.info( + "Output file not specified. Annotated MGF will be saved in the same directory as the input MGF." + ) + output = peak_path.replace(".mgf", "_annotated.mgf") + else: + output = setup_logging(output, verbosity) + + annotate_mgf(peak_path, tide_path, output) + + logger.info("DONE!") + + @main.command(cls=_SharedParams) @click.argument( "annotated_peak_path", diff --git a/casanovo/data/annotate_db.py b/casanovo/data/annotate_db.py new file mode 100644 index 00000000..984edee0 --- /dev/null +++ b/casanovo/data/annotate_db.py @@ -0,0 +1,9 @@ +"""Methods used to annotate an .mgf so that it can be used by Casanovo-DB""" + +from pathlib import Path +from typing import Optional, Tuple + + +def annotate_mgf(peak_path: str, tide_path: str, output: Optional[str]): + print(peak_path, tide_path, output) + ## TODO diff --git a/tests/conftest.py b/tests/conftest.py index 02a6d0f2..d6db572c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -260,3 +260,100 @@ def tiny_config(tmp_path): yaml.dump(cfg, out_file) return cfg_file + + +@pytest.fixture +def mgf_small_unannotated(tmp_path): + """An MGF file with 2 unannotated spectra and scan numbers.""" + peptides = ["LESLIEK", "PEPTIDEK"] + mgf_file = tmp_path / "small_unannotated.mgf" + return _create_unannotated_mgf(peptides, mgf_file) + + +def _create_unannotated_mgf(peptides, mgf_file, random_state=999): + """ + Create a fake MGF file from one or more peptides. + This file will have no SEQ= parameter, but will have a SCANS= parameter. + + Parameters + ---------- + peptides : str or list of str + The peptides for which to create spectra. + mgf_file : Path + The MGF file to create. + random_state : int or numpy.random.Generator, optional + The random seed. The charge states are chosen to be 2 or 3 randomly. 
+ + Returns + ------- + mgf_file : Path + """ + rng = np.random.default_rng(random_state) + entries = [ + _create_unannotated_mgf_entry(p, idx, rng.choice([2, 3])) + for idx, p in enumerate(peptides) + ] + with mgf_file.open("w+") as mgf_ref: + mgf_ref.write("\n".join(entries)) + + return mgf_file + + +def _create_unannotated_mgf_entry(peptide, scan_num, charge): + """ + Create a MassIVE-KB style MGF entry for a single PSM. + Each entry will have no SEQ= parameter, but will have a SCANS= parameter. + + Parameters + ---------- + peptide : str + A peptide sequence. + scan_num : int + The scan number. + charge : int, optional + The peptide charge state. + + Returns + ------- + str + The PSM entry in an MGF file format. + """ + precursor_mz = calculate_mass(peptide, charge=int(charge)) + mzs, intensities = _peptide_to_peaks(peptide, charge) + frags = "\n".join([f"{m} {i}" for m, i in zip(mzs, intensities)]) + + mgf = [ + "BEGIN IONS", + f"PEPMASS={precursor_mz}", + f"CHARGE={charge}+", + f"SCANS={scan_num}", + f"{frags}", + "END IONS", + ] + return "\n".join(mgf) + + +@pytest.fixture +def tide_dir_small(tmp_path): + """A directory with a very small TIDE search result.""" + tide_dir = tmp_path / "tide_results" + tide_dir.mkdir() + + _create_tide_results_target(tide_dir) + _create_tide_results_decoy(tide_dir) + + return tide_dir + + +def _create_tide_results_target(tide_dir): + """Create a fake TIDE search result file (target).""" + out_file = tide_dir / "tide-search.target.txt" + ## TODO + pass + + +def _create_tide_results_decoy(tide_dir): + """Create a fake TIDE search result file (decoy).""" + out_file = tide_dir / "tide-search.decoy.txt" + ## TODO + pass diff --git a/tests/test_integration.py b/tests/test_integration.py index e5d4b285..50bd1791 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -7,6 +7,11 @@ from casanovo import casanovo +def test_annotate(mgf_small_unannotated, tide_dir_small, tmp_path): + ## TODO + pass + + def test_train_and_run( mgf_small, mzml_small, tiny_config, tmp_path, monkeypatch ): From 30f598481377a69094b500250acbe9852c17b4fb Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Fri, 29 Mar 2024 15:26:34 -0700 Subject: [PATCH 02/84] add basic test for annotate mode --- casanovo/data/annotate_db.py | 125 ++++++++++++++++++++++++++++++++++- tests/conftest.py | 48 +++++++++++--- tests/test_integration.py | 28 +++++++- 3 files changed, 188 insertions(+), 13 deletions(-) diff --git a/casanovo/data/annotate_db.py b/casanovo/data/annotate_db.py index 984edee0..3e154dfb 100644 --- a/casanovo/data/annotate_db.py +++ b/casanovo/data/annotate_db.py @@ -2,8 +2,129 @@ from pathlib import Path from typing import Optional, Tuple +import os +import re +import logging + +import pandas as pd +import pyteomics.mgf as mgf + + +def _normalize_mods(seq: str) -> str: + """ + Turns tide-style modifications into the format used by Casanovo-DB. + + Parameters + ---------- + seq : str + The peptide sequence with tide-style modifications. + + Returns + ------- + str + The peptide sequence with Casanovo-DB-style modifications. 
+ """ + seq = seq.replace("C", "C+57.021") + seq = re.sub(r"M\[15\..*\]", r"M+15.995", seq) + seq = re.sub(r"N\[0\.9.*\]", r"N+0.984", seq) + seq = re.sub(r"Q\[0\.9.*\]", r"Q+0.984", seq) + seq = re.sub(r"(.*)\[42\..*\]", r"+42.011\1", seq) + seq = re.sub(r"(.*)\[43\..*\]", r"+43.006\1", seq) + seq = re.sub(r"(.*)\[\-17\..*\]", r"-17.027\1", seq) + seq = re.sub(r"(.*)\[25\..*\]", r"+43.006-17.027\1", seq) + return seq def annotate_mgf(peak_path: str, tide_path: str, output: Optional[str]): - print(peak_path, tide_path, output) - ## TODO + """ + Accepts a directory containing the results of a successful tide search, and an .mgf file containing MS/MS spectra. + The .mgf file is then annotated in the SEQ field with all of the candidate peptides for each spectrum, as well as their target/decoy status. + This annotated .mgf can be given directly to Casanovo-DB to perfrom a database search. + + Parameters + ---------- + tide_dir_path : str + Path to the directory containing the results of a successful tide search. + mgf_file : str + Path to the .mgf file containing MS/MS spectra. + output_file : str + Path to where the annotated .mgf will be written. + + """ + logger = logging.getLogger("casanovo") + # Get paths to tide search text files + tdf_path = os.path.join(tide_path, "tide-search.target.txt") + ddf_path = os.path.join(tide_path, "tide-search.decoy.txt") + try: + target_df = pd.read_csv( + tdf_path, sep="\t", usecols=["scan", "sequence", "target/decoy"] + ) + decoy_df = pd.read_csv( + ddf_path, sep="\t", usecols=["scan", "sequence", "target/decoy"] + ) + except FileNotFoundError as e: + logger.error( + "Could not find tide search results in the specified directory. " + "Please ensure that the directory contains the following files: " + "tide-search.target.txt and tide-search.decoy.txt" + ) + raise e + + logger.info("Successfully read tide search results from %s.", tide_path) + + df = pd.concat([target_df, decoy_df]) + scan_groups = df.groupby("scan")[["sequence", "target/decoy"]] + + scan_map = {} + + for scan, item in scan_groups: + td_group = item.groupby("target/decoy")["sequence"].apply(list) + if "target" in td_group.index: + target_candidate_list = list( + map( + _normalize_mods, + td_group["target"], + ) + ) + else: + target_candidate_list = [] + logger.warn(f"No target peptides found for scan {scan}.") + if "decoy" in td_group.index: + decoy_candidate_list = list( + map( + _normalize_mods, + td_group["decoy"], + ) + ) + decoy_candidate_list = list( + map(lambda x: "decoy_" + str(x), decoy_candidate_list) + ) + else: + decoy_candidate_list = [] + logger.warn(f"No decoy peptides found for scan {scan}.") + + scan_map[scan] = target_candidate_list + decoy_candidate_list + + all_spec = [] + for idx, spec_dict in enumerate(mgf.read(peak_path)): + try: + scan = int(spec_dict["params"]["scans"]) + except KeyError as e: + logger.error( + "Could not find the scan number in the .mgf file. Please ensure that the .mgf file contains the scan number in the 'SCANS' field." + ) + raise e + try: + spec_dict["params"]["seq"] = ",".join(list(scan_map[scan])) + all_spec.append(spec_dict) + except KeyError as e: + # No need to do anything if the scan is not found in the scan map + pass + try: + output = str(output) + logger.info(output) + mgf.write(all_spec, output, file_mode="w") + logger.info("Annotated .mgf file written to %s.", output) + except Exception as e: + print(f"Write to {output} failed. 
Check if the file path is correct.") + print(e) diff --git a/tests/conftest.py b/tests/conftest.py index d6db572c..237d0292 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,7 @@ """Fixtures used for testing.""" import numpy as np +import pandas as pd import psims import pytest import yaml @@ -324,6 +325,7 @@ def _create_unannotated_mgf_entry(peptide, scan_num, charge): mgf = [ "BEGIN IONS", + f"TITLE=title::{scan_num}", f"PEPMASS={precursor_mz}", f"CHARGE={charge}+", f"SCANS={scan_num}", @@ -339,21 +341,51 @@ def tide_dir_small(tmp_path): tide_dir = tmp_path / "tide_results" tide_dir.mkdir() - _create_tide_results_target(tide_dir) - _create_tide_results_decoy(tide_dir) + # Key is the scan number + built_dict = { + 0: { + "targets": ["LESLIEK", "PEPTIDEK"], + "decoys": ["KEILSEL", "KEDITEPP"], + }, + 1: { + "targets": ["LESLIEK", "PEPTIDEK"], + "decoys": ["KEILSEL", "KEDITEPP"], + }, + } + + _create_tide_results_target(tide_dir, built_dict) + _create_tide_results_decoy(tide_dir, built_dict) return tide_dir -def _create_tide_results_target(tide_dir): +def _create_tide_results_target(tide_dir, built_dict): """Create a fake TIDE search result file (target).""" out_file = tide_dir / "tide-search.target.txt" - ## TODO - pass + df = pd.DataFrame(columns=["scan", "sequence", "target/decoy"]) + for scan, peptides in built_dict.items(): + entry = pd.DataFrame.from_dict( + { + "scan": [scan] * len(peptides["targets"]), + "sequence": peptides["targets"], + "target/decoy": ["target"] * len(peptides["targets"]), + } + ) + df = pd.concat([df, entry], ignore_index=True) + df.to_csv(out_file, sep="\t", index=True) -def _create_tide_results_decoy(tide_dir): +def _create_tide_results_decoy(tide_dir, built_dict): """Create a fake TIDE search result file (decoy).""" out_file = tide_dir / "tide-search.decoy.txt" - ## TODO - pass + df = pd.DataFrame(columns=["scan", "sequence", "target/decoy"]) + for scan, peptides in built_dict.items(): + entry = pd.DataFrame.from_dict( + { + "scan": [scan] * len(peptides["decoys"]), + "sequence": peptides["decoys"], + "target/decoy": ["decoy"] * len(peptides["decoys"]), + } + ) + df = pd.concat([df, entry], ignore_index=True) + df.to_csv(out_file, sep="\t", index=True) diff --git a/tests/test_integration.py b/tests/test_integration.py index 50bd1791..8228432e 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -7,9 +7,31 @@ from casanovo import casanovo -def test_annotate(mgf_small_unannotated, tide_dir_small, tmp_path): - ## TODO - pass +def test_annotate( + mgf_small_unannotated, tide_dir_small, tiny_config, tmp_path +): + + # Run a command: + run = functools.partial( + CliRunner().invoke, casanovo.main, catch_exceptions=False + ) + + annotate_args = [ + "annotate", + str(mgf_small_unannotated), + str(tide_dir_small), + "--config", + tiny_config, + "--output", + str(tmp_path / "annotated_mgf.mgf"), + ] + + result = run(annotate_args) + + assert result.exit_code == 0 + assert (tmp_path / "annotated_mgf.mgf").exists() + + ## TODO: Write rest of test to verify the output file. 
def test_train_and_run( From 186bc0fa353517095ff043b91881420438451616 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Mon, 8 Apr 2024 22:05:22 -0700 Subject: [PATCH 03/84] added test case for annotate mode and modified method --- casanovo/data/annotate_db.py | 15 ++++++++------- tests/conftest.py | 13 ++++++++++++- tests/test_integration.py | 20 +++++++++++++++++++- 3 files changed, 39 insertions(+), 9 deletions(-) diff --git a/casanovo/data/annotate_db.py b/casanovo/data/annotate_db.py index 3e154dfb..3ff3c4b7 100644 --- a/casanovo/data/annotate_db.py +++ b/casanovo/data/annotate_db.py @@ -24,14 +24,15 @@ def _normalize_mods(seq: str) -> str: str The peptide sequence with Casanovo-DB-style modifications. """ + logger = logging.getLogger("casanovo") seq = seq.replace("C", "C+57.021") - seq = re.sub(r"M\[15\..*\]", r"M+15.995", seq) - seq = re.sub(r"N\[0\.9.*\]", r"N+0.984", seq) - seq = re.sub(r"Q\[0\.9.*\]", r"Q+0.984", seq) - seq = re.sub(r"(.*)\[42\..*\]", r"+42.011\1", seq) - seq = re.sub(r"(.*)\[43\..*\]", r"+43.006\1", seq) - seq = re.sub(r"(.*)\[\-17\..*\]", r"-17.027\1", seq) - seq = re.sub(r"(.*)\[25\..*\]", r"+43.006-17.027\1", seq) + seq = re.sub(r"M\[15\.[0-9]*\]", r"M+15.995", seq) + seq = re.sub(r"N\[0\.9[0-9]*\]", r"N+0.984", seq) + seq = re.sub(r"Q\[0\.9[0-9]*\]", r"Q+0.984", seq) + seq = re.sub(r"(.*)\[42\.[0-9]*\]", r"+42.011\1", seq) + seq = re.sub(r"(.*)\[43\.[0-9]*\]", r"+43.006\1", seq) + seq = re.sub(r"(.*)\[\-17\.[0-9]*\]", r"-17.027\1", seq) + seq = re.sub(r"(.*)\[25\.[0-9]*\]", r"+43.006-17.027\1", seq) return seq diff --git a/tests/conftest.py b/tests/conftest.py index 237d0292..eed4f39a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -266,7 +266,7 @@ def tiny_config(tmp_path): @pytest.fixture def mgf_small_unannotated(tmp_path): """An MGF file with 2 unannotated spectra and scan numbers.""" - peptides = ["LESLIEK", "PEPTIDEK"] + peptides = ["LESLIEK", "PEPTIDEK", "LESTIEK"] mgf_file = tmp_path / "small_unannotated.mgf" return _create_unannotated_mgf(peptides, mgf_file) @@ -351,6 +351,17 @@ def tide_dir_small(tmp_path): "targets": ["LESLIEK", "PEPTIDEK"], "decoys": ["KEILSEL", "KEDITEPP"], }, + 2: { + "targets": [ + "L[42.011]EM[15.9]SLIM[15.995]EK", + "P[43.01]EN[0.99]PTIQ[0.984]DEK", + ], + "decoys": [ + "K[-17.03]M[15.995]EILSEL", + "K[25.1]EDITEPP", + "KEDIQ[0.984]TEPPQ[0.984]", + ], + }, } _create_tide_results_target(tide_dir, built_dict) diff --git a/tests/test_integration.py b/tests/test_integration.py index 8228432e..d4d86d7d 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -31,7 +31,25 @@ def test_annotate( assert result.exit_code == 0 assert (tmp_path / "annotated_mgf.mgf").exists() - ## TODO: Write rest of test to verify the output file. 
+ # Read in the annotated file + with open(tmp_path / "annotated_mgf.mgf") as f: + annotated_lines = f.readlines() + + # Get each SEQ= line + seq_lines = [line for line in annotated_lines if line.startswith("SEQ=")] + assert len(seq_lines) == 3 + assert ( + seq_lines[0].strip() + == "SEQ=LESLIEK,PEPTIDEK,decoy_KEILSEL,decoy_KEDITEPP" + ) + assert ( + seq_lines[1].strip() + == "SEQ=LESLIEK,PEPTIDEK,decoy_KEILSEL,decoy_KEDITEPP" + ) + assert ( + seq_lines[2].strip() + == "SEQ=+42.011LEM+15.995SLIM+15.995EK,+43.006PEN+0.984PTIQ+0.984DEK,decoy_-17.027KM+15.995EILSEL,decoy_+43.006-17.027KEDITEPP,decoy_KEDIQ+0.984TEPPQ+0.984" + ) def test_train_and_run( From a8f50f473f6b8f1fb5da505c51b62ef0bc3fb24e Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Wed, 10 Apr 2024 23:58:30 -0700 Subject: [PATCH 04/84] very rough sketch of db upgrade (untested) --- casanovo/casanovo.py | 26 +++++ casanovo/data/datasets.py | 71 ++++++++++++++ casanovo/data/ms_io.py | 48 ++++++++++ casanovo/denovo/dataloaders.py | 96 ++++++++++++++++++- casanovo/denovo/model.py | 165 ++++++++++++++++++++++++++++++++ casanovo/denovo/model_runner.py | 115 +++++++++++++++++++++- 6 files changed, 518 insertions(+), 3 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index dcbecea3..c2bce3ef 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -191,6 +191,32 @@ def annotate( logger.info("DONE!") +@main.command(cls=_SharedParams) +@click.argument( + "peak_path", + required=True, + nargs=-1, + type=click.Path(exists=True, dir_okay=False), +) +def db_search( + peak_path: Tuple[str], + model: Optional[str], + config: Optional[str], + output: Optional[str], + verbosity: str, +) -> None: + """Perform a search using Casanovo-DB. + + PEAK_PATH must be one MGF file that has ANNOTATED spectra, as output by annotate mode. + """ + output = setup_logging(output, verbosity) + config, model = setup_model(model, config, output, False) + with ModelRunner(config, model) as runner: + runner.db_search(peak_path, output) + + logger.info("DONE!") + + @main.command(cls=_SharedParams) @click.argument( "annotated_peak_path", diff --git a/casanovo/data/datasets.py b/casanovo/data/datasets.py index 6244e88f..d0c6f347 100644 --- a/casanovo/data/datasets.py +++ b/casanovo/data/datasets.py @@ -265,3 +265,74 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, float, int, str]: mz_array, int_array, precursor_mz, precursor_charge ) return spectrum, precursor_mz, precursor_charge, peptide + + +class DBSpectrumDataset(AnnotatedSpectrumDataset): + """ + Parse and retrieve collections of annotated MS/MS spectra, additionally keep track of spectrum ids for Casanovo-DB. + + Parameters + ---------- + annotated_spectrum_index : depthcharge.data.SpectrumIndex + The MS/MS spectra to use as a dataset. + n_peaks : Optional[int] + The number of top-n most intense peaks to keep in each spectrum. `None` + retains all peaks. + min_mz : float + The minimum m/z to include. The default is 140 m/z, in order to exclude + TMT and iTRAQ reporter ions. + max_mz : float + The maximum m/z to include. + min_intensity : float + Remove peaks whose intensity is below `min_intensity` percentage of the + base peak intensity. + remove_precursor_tol : float + Remove peaks within the given mass tolerance in Dalton around the + precursor mass. + random_state : Optional[int] + The NumPy random state. ``None`` leaves mass spectra in the order they + were parsed. 
+ """ + + def __getitem__( + self, idx: int + ) -> Tuple[torch.Tensor, float, int, str, Tuple[str, str]]: + """ + Return the annotated MS/MS spectrum with the given index. + + Parameters + ---------- + idx : int + The index of the spectrum to return. + + Returns + ------- + spectrum : torch.Tensor of shape (n_peaks, 2) + A tensor of the spectrum with the m/z and intensity peak values. + precursor_mz : float + The precursor m/z. + precursor_charge : int + The precursor charge. + annotation : str + The peptide annotation of the spectrum. + spectrum_id: Tuple[str, str] + The unique spectrum identifier, formed by its original peak file and + identifier (index or scan number) therein. + """ + ( + mz_array, + int_array, + precursor_mz, + precursor_charge, + peptide, + ) = self.index[idx] + spectrum = self._process_peaks( + mz_array, int_array, precursor_mz, precursor_charge + ) + return ( + spectrum, + precursor_mz, + precursor_charge, + peptide, + self.get_spectrum_id(idx), + ) diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py index de69592e..1ae8cd16 100644 --- a/casanovo/data/ms_io.py +++ b/casanovo/data/ms_io.py @@ -209,3 +209,51 @@ def save(self) -> None: psm[6], # opt_ms_run[1]_aa_scores ] ) + + +class DBWriter(MztabWriter): + """ + Export DB search results to an mzTab file. + + Parameters + ---------- + filename : str + The name of the mzTab file. + """ + + def save(self) -> None: + """ + Export the DB search results to the mzTab file. + """ + with open(self.filename, "w", newline="") as f: + writer = csv.writer(f, delimiter="\t", lineterminator=os.linesep) + # Write metadata. + for row in self.metadata: + writer.writerow(["MTD", *row]) + # Write PSMs. + writer.writerow( + [ + "PSH", + "spectrum_index", + "sequence", + "precursor", + "score", + "target", + "aa_scores", + ] + ) + for i, psm in enumerate( + natsort.natsorted(self.psms, key=operator.itemgetter(1)), 1 + ): + for psm in list(zip(*psm)): + writer.writerow( + [ + "PSM", + psm[0], # spectrum_index + psm[1], # sequence + psm[2], # precursor + psm[3], # score + bool(psm[4]), # target + psm[5], # aa_scores + ] + ) diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index fe5d6237..760b0509 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -9,7 +9,11 @@ import torch from depthcharge.data import AnnotatedSpectrumIndex -from ..data.datasets import AnnotatedSpectrumDataset, SpectrumDataset +from ..data.datasets import ( + AnnotatedSpectrumDataset, + SpectrumDataset, + DBSpectrumDataset, +) class DeNovoDataModule(pl.LightningDataModule): @@ -88,7 +92,7 @@ def setup(self, stage: str = None, annotated: bool = True) -> None: Parameters ---------- - stage : str {"fit", "validate", "test"} + stage : str {"fit", "validate", "test", "db"} The stage indicating which Datasets to prepare. All are prepared by default. 
annotated: bool @@ -122,6 +126,17 @@ def setup(self, stage: str = None, annotated: bool = True) -> None: ) if self.test_index is not None: self.test_dataset = make_dataset(self.test_index) + if stage in (None, "db"): + make_dataset = functools.partial( + DBSpectrumDataset, + n_peaks=self.n_peaks, + min_mz=self.min_mz, + max_mz=self.max_mz, + min_intensity=self.min_intensity, + remove_precursor_tol=self.remove_precursor_tol, + ) + if self.test_index is not None: + self.test_dataset = make_dataset(self.test_index) def _make_loader( self, @@ -155,6 +170,35 @@ def _make_loader( shuffle=shuffle, ) + def _make_db_loader( + self, dataset: torch.utils.data.Dataset, batch_size: int + ) -> torch.utils.data.DataLoader: + """ + Create a PyTorch DataLoader. + + Parameters + ---------- + dataset : torch.utils.data.Dataset + A PyTorch Dataset. + + Returns + ------- + torch.utils.data.DataLoader + A PyTorch DataLoader. + """ + # Calculate new batch size to saturate previous batch size with PSMs + pep_per_spec = [] + for i in range(min(10, len(dataset))): + pep_per_spec.append(len(dataset[i][3].split(","))) + new_batch_size = int(batch_size // np.mean(pep_per_spec)) + return torch.utils.data.DataLoader( + dataset, + batch_size=new_batch_size, + collate_fn=prepare_db_batch, + pin_memory=True, + num_workers=self.n_workers, + ) + def train_dataloader(self) -> torch.utils.data.DataLoader: """Get the training DataLoader.""" return self._make_loader( @@ -173,6 +217,10 @@ def predict_dataloader(self) -> torch.utils.data.DataLoader: """Get the predict DataLoader.""" return self._make_loader(self.test_dataset, self.eval_batch_size) + def db_dataloader(self) -> torch.utils.data.DataLoader: + """Get the predict DataLoader.""" + return self._make_db_loader(self.test_dataset, self.eval_batch_size) + def prepare_batch( batch: List[Tuple[torch.Tensor, float, int, str]] @@ -211,3 +259,47 @@ def prepare_batch( [precursor_masses, precursor_charges, precursor_mzs] ).T.float() return spectra, precursors, np.asarray(spectrum_ids) + + +def prepare_db_batch( + batch: List[Tuple[torch.Tensor, float, int, str, Tuple[str, str]]] +) -> Tuple[torch.Tensor, torch.Tensor, np.ndarray, Tuple[str, str]]: + """ + Collate MS/MS spectra into a batch meant for Casanovo-DB. + + Parameters + ---------- + batch : List[Tuple[torch.Tensor, float, int, str, Tuple[str, str]]] + A batch of data from an AnnotatedSpectrumDataset, consisting of for each + spectrum (i) a tensor with the m/z and intensity peak values, (ii), the + precursor m/z, (iii) the precursor charge, (iv) the spectrum identifier (peptide), (v) + spectrum identifiers (file and scan). + + Returns + ------- + spectra : torch.Tensor of shape (batch_size, n_peaks, 2) + The padded mass spectra tensor with the m/z and intensity peak values + for each spectrum. + precursors : torch.Tensor of shape (batch_size, 3) + A tensor with the precursor neutral mass, precursor charge, and + precursor m/z. 
+ spectrum_peps : np.ndarray + Peptide sequences + spectrum_ids : Tuple[str, str] + Peak file and spectrum identifier + """ + ( + spectra, + precursor_mzs, + precursor_charges, + spectrum_peps, + spectrum_ids, + ) = list(zip(*batch)) + spectra = torch.nn.utils.rnn.pad_sequence(spectra, batch_first=True) + precursor_mzs = torch.tensor(precursor_mzs) + precursor_charges = torch.tensor(precursor_charges) + precursor_masses = (precursor_mzs - 1.007276) * precursor_charges + precursors = torch.vstack( + [precursor_masses, precursor_charges, precursor_mzs] + ).T.float() + return spectra, precursors, np.asarray(spectrum_peps), spectrum_ids diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 77df6df5..f1466907 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -989,6 +989,171 @@ def configure_optimizers( return [optimizer], {"scheduler": lr_scheduler, "interval": "step"} +class DBSpec2Pep(Spec2Pep): + """ + Inherits Spec2Pep + + Hijacks teacher-forcing implemented in Spec2Pep and uses it to predict scores between a spectra and associated peptide. + Input format is .mgf, with comma-separated targets and decoys in the SEQ field. Decoys should have a prefix of "decoy_". + """ + + num_pairs = 1024 + decoy_prefix = "decoy_" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def predict_step(self, batch, *args): + batch_res = [] + for ( + indexes, + t_or_d, + peptides, + precursors, + encoded_ms, + ) in self.smart_batch_gen(batch): + with torch.set_grad_enabled(True): + pred, truth = self.decoder(peptides, precursors, *encoded_ms) + sm = torch.nn.Softmax(dim=2) + pred = sm(pred) + score_result, per_aa_score = calc_match_score( + pred, truth + ) # Calculate the score between spectra + peptide list + batch_res.append( + ( + indexes, + t_or_d, + peptides, + score_result, + per_aa_score, + precursors, + ) + ) + return batch_res + + def smart_batch_gen(self, batch): + all_psm = [] + enc = self.encoder(batch[0]) + precursors = batch[1] + indexes = batch[3] + enc = list(zip(*enc)) + for idx, _ in enumerate(batch[0]): + spec_peptides = batch[2][idx].split(",") + # Check for decoy prefixes and create a bit-vector indicating targets (1) or decoys (0) + t_or_ds = [ + 0 if p.startswith(self.decoy_prefix) else 1 + for p in spec_peptides + ] + # Remove decoy prefix + spec_peptides = [ + s[len(self.decoy_prefix) :] + if s.startswith(self.decoy_prefix) + else s + for s in spec_peptides + ] + spec_precursors = [precursors[idx]] * len(spec_peptides) + spec_enc = [enc[idx]] * len(spec_peptides) + spec_idx = [indexes[idx]] * len(spec_peptides) + all_psm.extend( + list( + zip( + spec_enc, + spec_precursors, + spec_peptides, + spec_idx, + t_or_ds, + ) + ) + ) + # Continually grab num_pairs items from all_psm until list is exhausted + while len(all_psm) > 0: + batch = all_psm[: self.num_pairs] + all_psm = all_psm[self.num_pairs :] + batch = list(zip(*batch)) + encoded_ms = ( + torch.stack([a[0] for a in batch[0]]), + torch.stack([a[1] for a in batch[0]]), + ) + prec_data = torch.stack(batch[1]) + pep_str = list(batch[2]) + indexes = [a[1] for a in batch[3]] + t_or_ds = batch[4] + yield (indexes, t_or_ds, pep_str, prec_data, encoded_ms) + + def on_predict_batch_end( + self, + outputs: List[Tuple[np.ndarray, List[str], torch.Tensor]], + *args, + ) -> None: + if self.out_writer is None: + return + ( + indexes, + t_or_d, + peptides, + score_result, + per_aa_score, + precursors, + ) = list(zip(*outputs)) + for index, t_or_d, peptide, score, per_aa_scores, 
precursor in zip( + indexes, t_or_d, peptides, score_result, per_aa_score, precursors + ): + per_aa_scores = per_aa_scores.cpu().numpy() + per_aa_scores = list(per_aa_scores[per_aa_scores != 0]) + score = score.cpu().numpy() + precursor = precursor.cpu().numpy() + self.out_writer.psms.append( + (index, peptide, precursor, score, t_or_d, per_aa_scores), + ) + + +def calc_match_score( + batch_all_aa_scores: torch.Tensor, truth_aa_indicies: torch.Tensor +) -> List[float]: + """ + Take in teacher-forced scoring of amino acids of the peptides (in a batch) and use the truth labels + to calculate a score between the input spectra and associated peptide. The score is the geometric + mean of the AA probabilities + + Parameters + ---------- + batch_all_aa_scores : torch.Tensor + Amino acid scores for all amino acids in the vocabulary for every prediction made to generate the associated peptide (for an entire batch) + truth_aa_indicies : torch.Tensor + Indicies of the score for each actual amino acid in the peptide (for an entire batch) + + Returns + ------- + score : list[float], list[list[float]] + The score between the input spectra and associated peptide (for an entire batch) + a list of lists of per amino acid scores (for an entire batch) + """ + # Remove trailing tokens from predictions, + batch_all_aa_scores = batch_all_aa_scores[:, :-1] + + # Vectorized scoring using efficient indexing. + rows = ( + torch.arange(batch_all_aa_scores.shape[0]) + .unsqueeze(-1) + .expand(-1, batch_all_aa_scores.shape[1]) + ) + cols = torch.arange(0, batch_all_aa_scores.shape[1]).expand_as(rows) + + per_aa_scores = batch_all_aa_scores[rows, cols, truth_aa_indicies] + + score_mask = truth_aa_indicies != 0 + masked_per_aa_scores = per_aa_scores * score_mask + # all_scores = masked_per_aa_scores.sum(dim=1) / score_mask.sum(dim=1) # Calculated arithmetic score + all_scores = torch.where( + torch.log(masked_per_aa_scores) == float("-inf"), + torch.tensor(0.0), + torch.log(masked_per_aa_scores), + ).sum(dim=1) / score_mask.sum( + dim=1 + ) # Calculates geometric score + return all_scores, masked_per_aa_scores + + class CosineWarmupScheduler(torch.optim.lr_scheduler._LRScheduler): """ Learning rate scheduler with linear warm-up followed by cosine shaped decay. 
diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 4bd2165e..f70e2be6 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -20,7 +20,7 @@ from ..config import Config from ..data import ms_io from ..denovo.dataloaders import DeNovoDataModule -from ..denovo.model import Spec2Pep +from ..denovo.model import Spec2Pep, DBSpec2Pep logger = logging.getLogger("casanovo") @@ -79,6 +79,25 @@ def __exit__(self, exc_type, exc_value, traceback): if self.writer is not None: self.writer.save() + def db_search(self, peak_path: Iterable[str], output: str) -> None: + """Casanovo-DB TODO DOCS""" + self.writer = ms_io.DBWriter(Path(output).with_suffix(".mztab")) + self.writer.set_metadata( + self.config, + model=str(self.model_filename), + config_filename=self.config.file, + ) + + self.initialize_trainer(train=True) + self.initialize_db_model() + self.model.out_writer = self.writer + + test_index = self._get_index(peak_path, True, "db search") + self.writer.set_ms_run(test_index.ms_files) + self.initialize_data_module(test_index=test_index) + self.loaders.setup(stage="db") + self.trainer.predict(self.model, self.loaders.db_dataloader()) + def train( self, train_peak_path: Iterable[str], @@ -198,6 +217,100 @@ def initialize_trainer(self, train: bool) -> None: self.trainer = pl.Trainer(**trainer_cfg) + def initialize_db_model(self) -> None: + """Initialize the Casanovo-DB model. + Required because the DB search model is a unique subclass of the Spec2Pep model. + """ + model_params = dict( + dim_model=self.config.dim_model, + n_head=self.config.n_head, + dim_feedforward=self.config.dim_feedforward, + n_layers=self.config.n_layers, + dropout=self.config.dropout, + dim_intensity=self.config.dim_intensity, + max_length=self.config.max_length, + residues=self.config.residues, + max_charge=self.config.max_charge, + precursor_mass_tol=self.config.precursor_mass_tol, + isotope_error_range=self.config.isotope_error_range, + min_peptide_len=self.config.min_peptide_len, + n_beams=self.config.n_beams, + top_match=self.config.top_match, + n_log=self.config.n_log, + tb_summarywriter=self.config.tb_summarywriter, + train_label_smoothing=self.config.train_label_smoothing, + warmup_iters=self.config.warmup_iters, + cosine_schedule_period_iters=self.config.cosine_schedule_period_iters, + lr=self.config.learning_rate, + weight_decay=self.config.weight_decay, + out_writer=self.writer, + calculate_precision=self.config.calculate_precision, + ) + + # Reconfigurable non-architecture related parameters for a loaded model. 
+ loaded_model_params = dict( + max_length=self.config.max_length, + precursor_mass_tol=self.config.precursor_mass_tol, + isotope_error_range=self.config.isotope_error_range, + n_beams=self.config.n_beams, + min_peptide_len=self.config.min_peptide_len, + top_match=self.config.top_match, + n_log=self.config.n_log, + tb_summarywriter=self.config.tb_summarywriter, + train_label_smoothing=self.config.train_label_smoothing, + warmup_iters=self.config.warmup_iters, + cosine_schedule_period_iters=self.config.cosine_schedule_period_iters, + lr=self.config.learning_rate, + weight_decay=self.config.weight_decay, + out_writer=self.writer, + calculate_precision=self.config.calculate_precision, + ) + + # Model file must exist for DB search + if self.model_filename is None: + logger.error("A model file must be provided") + raise ValueError("A model file must be provided") + + if not Path(self.model_filename).exists(): + logger.error( + "Could not find the model weights at file %s", + self.model_filename, + ) + raise FileNotFoundError("Could not find the model weights file") + + # First try loading model details from the weights file, otherwise use + # the provided configuration. + device = torch.empty(1).device # Use the default device. + try: + self.model = DBSpec2Pep.load_from_checkpoint( + self.model_filename, map_location=device, **loaded_model_params + ) + + architecture_params = set(model_params.keys()) - set( + loaded_model_params.keys() + ) + for param in architecture_params: + if model_params[param] != self.model.hparams[param]: + warnings.warn( + f"Mismatching {param} parameter in " + f"model checkpoint ({self.model.hparams[param]}) " + f"vs config file ({model_params[param]}); " + "using the checkpoint." + ) + except RuntimeError: + # This only doesn't work if the weights are from an older version + try: + self.model = DBSpec2Pep.load_from_checkpoint( + self.model_filename, + map_location=device, + **model_params, + ) + except RuntimeError: + raise RuntimeError( + "Weights file incompatible with the current version of " + "Casanovo." + ) + def initialize_model(self, train: bool) -> None: """Initialize the Casanovo model. From dae9c8a78b712575ed699e690ce674b7f6a46377 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Sun, 14 Apr 2024 21:27:41 -0700 Subject: [PATCH 05/84] small upgrades to documentation --- casanovo/denovo/dataloaders.py | 2 +- casanovo/denovo/model.py | 48 +++++++++++++++------------------ casanovo/denovo/model_runner.py | 19 ++++++++++++- 3 files changed, 41 insertions(+), 28 deletions(-) diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 760b0509..efb346ab 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -190,7 +190,7 @@ def _make_db_loader( pep_per_spec = [] for i in range(min(10, len(dataset))): pep_per_spec.append(len(dataset[i][3].split(","))) - new_batch_size = int(batch_size // np.mean(pep_per_spec)) + new_batch_size = max(1, int(batch_size // np.mean(pep_per_spec))) return torch.utils.data.DataLoader( dataset, batch_size=new_batch_size, diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index f1466907..d31820f2 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -997,13 +997,13 @@ class DBSpec2Pep(Spec2Pep): Input format is .mgf, with comma-separated targets and decoys in the SEQ field. Decoys should have a prefix of "decoy_". 
""" - num_pairs = 1024 - decoy_prefix = "decoy_" + num_pairs = None # Modified to be predict_batch_size from config def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def predict_step(self, batch, *args): + logger.info("New batch") batch_res = [] for ( indexes, @@ -1012,23 +1012,22 @@ def predict_step(self, batch, *args): precursors, encoded_ms, ) in self.smart_batch_gen(batch): - with torch.set_grad_enabled(True): - pred, truth = self.decoder(peptides, precursors, *encoded_ms) - sm = torch.nn.Softmax(dim=2) - pred = sm(pred) - score_result, per_aa_score = calc_match_score( - pred, truth - ) # Calculate the score between spectra + peptide list - batch_res.append( - ( - indexes, - t_or_d, - peptides, - score_result, - per_aa_score, - precursors, - ) + pred, truth = self.decoder(peptides, precursors, *encoded_ms) + sm = torch.nn.Softmax(dim=2) + pred = sm(pred) + score_result, per_aa_score = calc_match_score( + pred, truth + ) # Calculate the score between spectra + peptide list + batch_res.append( + ( + indexes, + t_or_d, + peptides, + score_result.cpu().detach().numpy(), + per_aa_score.cpu().detach().numpy(), + precursors.cpu().detach().numpy(), ) + ) return batch_res def smart_batch_gen(self, batch): @@ -1040,15 +1039,13 @@ def smart_batch_gen(self, batch): for idx, _ in enumerate(batch[0]): spec_peptides = batch[2][idx].split(",") # Check for decoy prefixes and create a bit-vector indicating targets (1) or decoys (0) + decoy_prefix = "decoy_" # Decoy prefix t_or_ds = [ - 0 if p.startswith(self.decoy_prefix) else 1 - for p in spec_peptides + 0 if p.startswith(decoy_prefix) else 1 for p in spec_peptides ] # Remove decoy prefix spec_peptides = [ - s[len(self.decoy_prefix) :] - if s.startswith(self.decoy_prefix) - else s + s[len(decoy_prefix) :] if s.startswith(decoy_prefix) else s for s in spec_peptides ] spec_precursors = [precursors[idx]] * len(spec_peptides) @@ -1066,6 +1063,8 @@ def smart_batch_gen(self, batch): ) ) # Continually grab num_pairs items from all_psm until list is exhausted + logger.info(f"Received {len(all_psm)} PSMs") + logger.info(f"Processing num_pairs: {self.num_pairs}") while len(all_psm) > 0: batch = all_psm[: self.num_pairs] all_psm = all_psm[self.num_pairs :] @@ -1098,10 +1097,7 @@ def on_predict_batch_end( for index, t_or_d, peptide, score, per_aa_scores, precursor in zip( indexes, t_or_d, peptides, score_result, per_aa_score, precursors ): - per_aa_scores = per_aa_scores.cpu().numpy() per_aa_scores = list(per_aa_scores[per_aa_scores != 0]) - score = score.cpu().numpy() - precursor = precursor.cpu().numpy() self.out_writer.psms.append( (index, peptide, precursor, score, t_or_d, per_aa_scores), ) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index f70e2be6..e6956049 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -80,7 +80,19 @@ def __exit__(self, exc_type, exc_value, traceback): self.writer.save() def db_search(self, peak_path: Iterable[str], output: str) -> None: - """Casanovo-DB TODO DOCS""" + """Perform database search with Casanovo. + + Parameters + ---------- + peak_path : iterable of str + The path to the annotated .mgf data files for database search. + output : str + Where should the output be saved? 
+ + Returns + ------- + self + """ self.writer = ms_io.DBWriter(Path(output).with_suffix(".mztab")) self.writer.set_metadata( self.config, @@ -286,6 +298,9 @@ def initialize_db_model(self) -> None: self.model_filename, map_location=device, **loaded_model_params ) + # Pass in information about predict_batch_size to the model for batch saturation + self.model.num_pairs = self.config.predict_batch_size + architecture_params = set(model_params.keys()) - set( loaded_model_params.keys() ) @@ -305,6 +320,8 @@ def initialize_db_model(self) -> None: map_location=device, **model_params, ) + # Pass in information about predict_batch_size to the model for batch saturation + self.model.num_pairs = self.config.predict_batch_size except RuntimeError: raise RuntimeError( "Weights file incompatible with the current version of " From 7f95ae5a0c07f76cfea3004daabfe9bb832423fd Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Sun, 14 Apr 2024 23:01:31 -0700 Subject: [PATCH 06/84] better output formatting --- casanovo/data/ms_io.py | 30 +++++++++++++++++++++--------- casanovo/denovo/model.py | 6 +----- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py index 1ae8cd16..922a6de7 100644 --- a/casanovo/data/ms_io.py +++ b/casanovo/data/ms_io.py @@ -236,24 +236,36 @@ def save(self) -> None: "PSH", "spectrum_index", "sequence", - "precursor", + "precursor_mass", + "precursor_charge", + "precursor_mz", "score", "target", "aa_scores", ] ) for i, psm in enumerate( - natsort.natsorted(self.psms, key=operator.itemgetter(1)), 1 + natsort.natsorted(self.psms, key=operator.itemgetter(0)), 1 ): - for psm in list(zip(*psm)): + # [precursor_masses, precursor_charges, precursor_mzs] + for rowinfo in list(zip(*psm)): writer.writerow( [ "PSM", - psm[0], # spectrum_index - psm[1], # sequence - psm[2], # precursor - psm[3], # score - bool(psm[4]), # target - psm[5], # aa_scores + rowinfo[0], # spectrum_index + rowinfo[1], # sequence + rowinfo[2][0], # precursor mass + int(rowinfo[2][1]), # precursor charge + rowinfo[2][2], # precursor m/z + rowinfo[3], # score + bool(rowinfo[4]), # target + ",".join( + list( + map( + "{:.5f}".format, + rowinfo[5][rowinfo[5] != 0], + ) + ) + ), # aa_scores including stop token ] ) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index d31820f2..30e8862e 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1003,7 +1003,6 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def predict_step(self, batch, *args): - logger.info("New batch") batch_res = [] for ( indexes, @@ -1063,8 +1062,6 @@ def smart_batch_gen(self, batch): ) ) # Continually grab num_pairs items from all_psm until list is exhausted - logger.info(f"Received {len(all_psm)} PSMs") - logger.info(f"Processing num_pairs: {self.num_pairs}") while len(all_psm) > 0: batch = all_psm[: self.num_pairs] all_psm = all_psm[self.num_pairs :] @@ -1097,7 +1094,6 @@ def on_predict_batch_end( for index, t_or_d, peptide, score, per_aa_scores, precursor in zip( indexes, t_or_d, peptides, score_result, per_aa_score, precursors ): - per_aa_scores = list(per_aa_scores[per_aa_scores != 0]) self.out_writer.psms.append( (index, peptide, precursor, score, t_or_d, per_aa_scores), ) @@ -1139,7 +1135,7 @@ def calc_match_score( score_mask = truth_aa_indicies != 0 masked_per_aa_scores = per_aa_scores * score_mask - # all_scores = masked_per_aa_scores.sum(dim=1) / score_mask.sum(dim=1) # Calculated arithmetic score + # all_scores = 
masked_per_aa_scores.sum(dim=1) / score_mask.sum(dim=1) # Calculated arithmetic score that was used before all_scores = torch.where( torch.log(masked_per_aa_scores) == float("-inf"), torch.tensor(0.0), From 278436b2b93be01f49632b5c7a00d4f16c31d8f0 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Sat, 27 Apr 2024 13:16:23 -0700 Subject: [PATCH 07/84] all tests added --- casanovo/config.yaml | 3 +- casanovo/data/ms_io.py | 83 ++++++++++++++++++++++------------- casanovo/denovo/model.py | 40 ++++++++++++----- tests/test_integration.py | 77 ++++++++++++++++++++++++++++++++ tests/unit_tests/test_unit.py | 50 ++++++++++++++++++++- 5 files changed, 209 insertions(+), 44 deletions(-) diff --git a/casanovo/config.yaml b/casanovo/config.yaml index c7186ff7..7b207664 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -15,7 +15,8 @@ precursor_mass_tol: 50 # ppm isotope_error_range: [0, 1] # The minimum length of predicted peptides. min_peptide_len: 6 -# Number of spectra in one inference batch. +# Number of spectra in one inference batch. +# Also the number of PSMs processed by Casanovo-DB per batch. predict_batch_size: 1024 # Number of beams used in beam search. n_beams: 1 diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py index 922a6de7..ae4f3b54 100644 --- a/casanovo/data/ms_io.py +++ b/casanovo/data/ms_io.py @@ -224,6 +224,7 @@ class DBWriter(MztabWriter): def save(self) -> None: """ Export the DB search results to the mzTab file. + Outputs PSMs in the order they were scored (i.e. the order in the annotated .mgf file). """ with open(self.filename, "w", newline="") as f: writer = csv.writer(f, delimiter="\t", lineterminator=os.linesep) @@ -234,38 +235,58 @@ def save(self) -> None: writer.writerow( [ "PSH", - "spectrum_index", "sequence", - "precursor_mass", - "precursor_charge", - "precursor_mz", - "score", - "target", - "aa_scores", + "PSM_ID", + "accession", + "unique", + "database", + "database_version", + "search_engine", + "search_engine_score[1]", + "modifications", + "retention_time", + "charge", + "exp_mass_to_charge", + "calc_mass_to_charge", + "spectra_ref", + "pre", + "post", + "start", + "end", + "opt_ms_run[1]_aa_scores", + "opt_target", ] ) - for i, psm in enumerate( - natsort.natsorted(self.psms, key=operator.itemgetter(0)), 1 - ): - # [precursor_masses, precursor_charges, precursor_mzs] - for rowinfo in list(zip(*psm)): - writer.writerow( - [ - "PSM", - rowinfo[0], # spectrum_index - rowinfo[1], # sequence - rowinfo[2][0], # precursor mass - int(rowinfo[2][1]), # precursor charge - rowinfo[2][2], # precursor m/z - rowinfo[3], # score - bool(rowinfo[4]), # target - ",".join( - list( - map( - "{:.5f}".format, - rowinfo[5][rowinfo[5] != 0], - ) + for i, psm in enumerate(self.psms): + writer.writerow( + [ + "PSM", + psm[0], # sequence + f"{psm[5]}:{i}", # spectra_ref + "null", # accession + "null", # unique + "null", # database + "null", # database_version + "null", # search_engine + psm[1], # search_engine_score[1] + "null", # modifications + "null", # retention_time + int(psm[2]), # charge + psm[3], # exp_mass_to_charge + psm[4], # calc_mass_to_charge + psm[5], # spectra_ref + "null", # pre + "null", # post + "null", # start + "null", # end + ",".join( + list( + map( + "{:.5f}".format, + psm[6][psm[6] != 0], ) - ), # aa_scores including stop token - ] - ) + ) + ), # opt_ms_run[1]_aa_scores + bool(psm[7]), # opt_target + ] + ) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 30e8862e..4efe0f92 100644 --- a/casanovo/denovo/model.py +++ 
b/casanovo/denovo/model.py @@ -1014,7 +1014,7 @@ def predict_step(self, batch, *args): pred, truth = self.decoder(peptides, precursors, *encoded_ms) sm = torch.nn.Softmax(dim=2) pred = sm(pred) - score_result, per_aa_score = calc_match_score( + score_result, per_aa_score = _calc_match_score( pred, truth ) # Calculate the score between spectra + peptide list batch_res.append( @@ -1083,23 +1083,40 @@ def on_predict_batch_end( ) -> None: if self.out_writer is None: return - ( + for ( indexes, t_or_d, peptides, score_result, per_aa_score, precursors, - ) = list(zip(*outputs)) - for index, t_or_d, peptide, score, per_aa_scores, precursor in zip( - indexes, t_or_d, peptides, score_result, per_aa_score, precursors - ): - self.out_writer.psms.append( - (index, peptide, precursor, score, t_or_d, per_aa_scores), - ) + ) in outputs: + for index, t_or_d, peptide, score, per_aa_scores, precursor in zip( + indexes, + t_or_d, + peptides, + score_result, + per_aa_score, + precursors, + ): + prec_charge = precursor[1] + prec_mz = precursor[2] + calc_mz = precursor[2] + self.out_writer.psms.append( + ( + peptide, + score, + prec_charge, + prec_mz, + calc_mz, + index, + per_aa_scores, + t_or_d, + ), + ) -def calc_match_score( +def _calc_match_score( batch_all_aa_scores: torch.Tensor, truth_aa_indicies: torch.Tensor ) -> List[float]: """ @@ -1135,7 +1152,8 @@ def calc_match_score( score_mask = truth_aa_indicies != 0 masked_per_aa_scores = per_aa_scores * score_mask - # all_scores = masked_per_aa_scores.sum(dim=1) / score_mask.sum(dim=1) # Calculated arithmetic score that was used before + # Arithmetic score that was used before + ## all_scores = masked_per_aa_scores.sum(dim=1) / score_mask.sum(dim=1) all_scores = torch.where( torch.log(masked_per_aa_scores) == float("-inf"), torch.tensor(0.0), diff --git a/tests/test_integration.py b/tests/test_integration.py index d4d86d7d..73232fa7 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -52,6 +52,83 @@ def test_annotate( ) +def test_db_search( + mgf_small_unannotated, tide_dir_small, tiny_config, tmp_path +): + # Run a command: + run = functools.partial( + CliRunner().invoke, casanovo.main, catch_exceptions=False + ) + + annotate_args = [ + "annotate", + str(mgf_small_unannotated), + str(tide_dir_small), + "--config", + tiny_config, + "--output", + str(tmp_path / "annotated_mgf.mgf"), + ] + + result = run(annotate_args) + + assert result.exit_code == 0 + assert (tmp_path / "annotated_mgf.mgf").exists() + + # Follow up annotate run with db search + + output_path = tmp_path / "db_search.mztab" + + search_args = [ + "db-search", + str(tmp_path / "annotated_mgf.mgf"), + "--config", + tiny_config, + "--output", + str(output_path), + ] + + result = run(search_args) + + assert result.exit_code == 0 + assert output_path.exists() + assert output_path.is_file() + + mztab = pyteomics.mztab.MzTab(str(output_path)) + + psms = mztab.spectrum_match_table + assert list(psms.sequence) == [ + "LESLIEK", + "PEPTIDEK", + "KEILSEL", + "KEDITEPP", + "LESLIEK", + "PEPTIDEK", + "KEILSEL", + "KEDITEPP", + "+42.011LEM+15.995SLIM+15.995EK", + "+43.006PEN+0.984PTIQ+0.984DEK", + "-17.027KM+15.995EILSEL", + "+43.006-17.027KEDITEPP", + "KEDIQ+0.984TEPPQ+0.984", + ] + assert list(psms.opt_target) == [ + "True", + "True", + "False", + "False", + "True", + "True", + "False", + "False", + "True", + "True", + "False", + "False", + "False", + ] + + def test_train_and_run( mgf_small, mzml_small, tiny_config, tmp_path, monkeypatch ): diff --git a/tests/unit_tests/test_unit.py 
b/tests/unit_tests/test_unit.py index f615a099..bcc61446 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -16,7 +16,7 @@ from casanovo.data import ms_io from casanovo.data.datasets import SpectrumDataset, AnnotatedSpectrumDataset from casanovo.denovo.evaluate import aa_match_batch, aa_match_metrics -from casanovo.denovo.model import Spec2Pep, _aa_pep_score +from casanovo.denovo.model import Spec2Pep, _aa_pep_score, _calc_match_score from depthcharge.data import SpectrumIndex, AnnotatedSpectrumIndex @@ -139,6 +139,54 @@ def test_aa_pep_score(): assert peptide_score == pytest.approx(-0.5) +def test_calc_match_score(): + """ + Test the calculation of geometric scores using teacher-forced + decoder output probabilities and ground truth amino acid sequences. + """ + first_slot_prob = torch.zeros(29) + first_slot_prob[1] = 1.0 # A + second_slot_prob = torch.zeros(29) + second_slot_prob[2] = 1.0 # B + third_slot_prob = torch.zeros(29) + third_slot_prob[3] = 1.0 # C + stop_slot_prob = torch.zeros(29) + stop_slot_prob[28] = 1.0 # $ + blank_slot_prob = torch.zeros(29) + + pep_1_aa = torch.stack( + [ + first_slot_prob, + second_slot_prob, + third_slot_prob, + stop_slot_prob, + blank_slot_prob, + ] + ) + pep_2_aa = torch.stack( + [ + third_slot_prob, + second_slot_prob, + stop_slot_prob, + blank_slot_prob, + blank_slot_prob, + ] + ) + + batch_all_aa_scores = torch.stack([pep_1_aa, pep_2_aa]) + truth_aa_indices = torch.tensor([[1, 2, 3, 28], [3, 2, 28, 0]]) + + all_scores, masked_per_aa_scores = _calc_match_score( + batch_all_aa_scores, truth_aa_indices + ) + + assert all_scores.numpy()[0] == pytest.approx(0) + assert all_scores.numpy()[1] == pytest.approx(0) + + assert np.sum(masked_per_aa_scores.numpy()[0]) == pytest.approx(4) + assert np.sum(masked_per_aa_scores.numpy()[1]) == pytest.approx(3) + + def test_beam_search_decode(): """ Test beam search decoding and its sub-functions. From 949ea9392c625b63b20fceba25e1bea973d68cba Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Sat, 27 Apr 2024 13:25:49 -0700 Subject: [PATCH 08/84] remove minor debugging print statement --- casanovo/data/annotate_db.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/casanovo/data/annotate_db.py b/casanovo/data/annotate_db.py index 3ff3c4b7..db27b05f 100644 --- a/casanovo/data/annotate_db.py +++ b/casanovo/data/annotate_db.py @@ -127,5 +127,6 @@ def annotate_mgf(peak_path: str, tide_path: str, output: Optional[str]): mgf.write(all_spec, output, file_mode="w") logger.info("Annotated .mgf file written to %s.", output) except Exception as e: - print(f"Write to {output} failed. Check if the file path is correct.") - print(e) + logger.error( + "Write to %s failed. 
Check if the file path is correct.", output + ) From da5ef5e29f8337974815b9c09c27e09967182e25 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 27 Apr 2024 20:28:46 +0000 Subject: [PATCH 09/84] Generate new screengrabs with rich-codex --- docs/images/help.svg | 213 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 docs/images/help.svg diff --git a/docs/images/help.svg b/docs/images/help.svg new file mode 100644 index 00000000..533b5f70 --- /dev/null +++ b/docs/images/help.svg @@ -0,0 +1,213 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + $ casanovo --help + +Usage:casanovo [OPTIONSCOMMAND [ARGS]...                                     + + ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓  + ┃                                  Casanovo                                  ┃  + ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛  + Casanovo de novo sequences peptides from tandem mass spectra using a            + Transformer model. Casanovo currently supports mzML, mzXML, and MGF files for   + de novo sequencing and annotated MGF files, such as those from MassIVE-KB, for  + training new models.                                                            + + Links:                                                                          + + • Documentation: https://casanovo.readthedocs.io + • Official code repository: https://github.com/Noble-Lab/casanovo + + If you use Casanovo in your work, please cite:                                  + + • Yilmaz, M., Fondrie, W. E., Bittremieux, W., Oh, S. & Noble, W. S. De novo   +mass spectrometry peptide sequencing with a transformer model. Proceedings   +of the 39th International Conference on Machine Learning - ICML '22 (2022)   +doi:10.1101/2022.02.07.479481.                                               + +╭─ Options ────────────────────────────────────────────────────────────────────╮ +--help-h    Show this message and exit.                                     +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Commands ───────────────────────────────────────────────────────────────────╮ +annotate  Annotate a given .mgf with candidates as selected by a Tide        + search for Casanovo-DB.                                            +configure Generate a Casanovo configuration file to customize.               +db-search Perform a search using Casanovo-DB.                                +evaluate  Evaluate de novo peptide sequencing performance.                   +sequence  De novo sequence peptides from tandem mass spectra.                +train     Train a Casanovo model on your own data.                           
+version   Get the Casanovo version information                               +╰──────────────────────────────────────────────────────────────────────────────╯ + + + + + From 53f6bec021f2dd19285278317b0f56370b0652ab Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Sat, 27 Apr 2024 13:52:39 -0700 Subject: [PATCH 10/84] remove excess info logs, add monkeypatch to tests --- casanovo/data/annotate_db.py | 1 - tests/test_integration.py | 6 ++++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/casanovo/data/annotate_db.py b/casanovo/data/annotate_db.py index db27b05f..4f33b798 100644 --- a/casanovo/data/annotate_db.py +++ b/casanovo/data/annotate_db.py @@ -123,7 +123,6 @@ def annotate_mgf(peak_path: str, tide_path: str, output: Optional[str]): pass try: output = str(output) - logger.info(output) mgf.write(all_spec, output, file_mode="w") logger.info("Annotated .mgf file written to %s.", output) except Exception as e: diff --git a/tests/test_integration.py b/tests/test_integration.py index 73232fa7..56fb0790 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -8,10 +8,11 @@ def test_annotate( - mgf_small_unannotated, tide_dir_small, tiny_config, tmp_path + mgf_small_unannotated, tide_dir_small, tiny_config, tmp_path, monkeypatch ): # Run a command: + monkeypatch.setattr(casanovo, "__version__", "3.0.1") run = functools.partial( CliRunner().invoke, casanovo.main, catch_exceptions=False ) @@ -53,9 +54,10 @@ def test_annotate( def test_db_search( - mgf_small_unannotated, tide_dir_small, tiny_config, tmp_path + mgf_small_unannotated, tide_dir_small, tiny_config, tmp_path, monkeypatch ): # Run a command: + monkeypatch.setattr(casanovo, "__version__", "3.0.1") run = functools.partial( CliRunner().invoke, casanovo.main, catch_exceptions=False ) From 81aa073c55510cb7842bdb5a1b983bcda1a66457 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Sat, 27 Apr 2024 13:59:22 -0700 Subject: [PATCH 11/84] mp fix --- tests/test_integration.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_integration.py b/tests/test_integration.py index 56fb0790..e8654c68 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -8,11 +8,10 @@ def test_annotate( - mgf_small_unannotated, tide_dir_small, tiny_config, tmp_path, monkeypatch + mgf_small_unannotated, tide_dir_small, tiny_config, tmp_path ): # Run a command: - monkeypatch.setattr(casanovo, "__version__", "3.0.1") run = functools.partial( CliRunner().invoke, casanovo.main, catch_exceptions=False ) @@ -57,7 +56,7 @@ def test_db_search( mgf_small_unannotated, tide_dir_small, tiny_config, tmp_path, monkeypatch ): # Run a command: - monkeypatch.setattr(casanovo, "__version__", "3.0.1") + monkeypatch.setattr(casanovo, "__version__", "4.1.1") run = functools.partial( CliRunner().invoke, casanovo.main, catch_exceptions=False ) From 0ecbd80c9e209d3b796e6596bbb8b665bd05900c Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Mon, 6 May 2024 23:26:39 -0700 Subject: [PATCH 12/84] fix line lengths and modify test --- casanovo/denovo/model.py | 25 +++++++++++++++++-------- tests/test_integration.py | 7 ++++--- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 4efe0f92..ec234691 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -993,8 +993,10 @@ class DBSpec2Pep(Spec2Pep): """ Inherits Spec2Pep - Hijacks teacher-forcing implemented in Spec2Pep and uses it to predict scores between a spectra and 
associated peptide. - Input format is .mgf, with comma-separated targets and decoys in the SEQ field. Decoys should have a prefix of "decoy_". + Hijacks teacher-forcing implemented in Spec2Pep and + uses it to predict scores between a spectra and associated peptide. + Input format is .mgf, with comma-separated targets + and decoys in the SEQ field. Decoys should have a prefix of "decoy_". """ num_pairs = None # Modified to be predict_batch_size from config @@ -1120,22 +1122,29 @@ def _calc_match_score( batch_all_aa_scores: torch.Tensor, truth_aa_indicies: torch.Tensor ) -> List[float]: """ - Take in teacher-forced scoring of amino acids of the peptides (in a batch) and use the truth labels - to calculate a score between the input spectra and associated peptide. The score is the geometric + Take in teacher-forced scoring of amino acids + of the peptides (in a batch) and use the truth labels + to calculate a score between the input spectra and + associated peptide. The score is the geometric mean of the AA probabilities Parameters ---------- batch_all_aa_scores : torch.Tensor - Amino acid scores for all amino acids in the vocabulary for every prediction made to generate the associated peptide (for an entire batch) + Amino acid scores for all amino acids in + the vocabulary for every prediction made to generate + the associated peptide (for an entire batch) truth_aa_indicies : torch.Tensor - Indicies of the score for each actual amino acid in the peptide (for an entire batch) + Indicies of the score for each actual amino acid + in the peptide (for an entire batch) Returns ------- score : list[float], list[list[float]] - The score between the input spectra and associated peptide (for an entire batch) - a list of lists of per amino acid scores (for an entire batch) + The score between the input spectra and associated peptide + (for an entire batch) + a list of lists of per amino acid scores + (for an entire batch) """ # Remove trailing tokens from predictions, batch_all_aa_scores = batch_all_aa_scores[:, :-1] diff --git a/tests/test_integration.py b/tests/test_integration.py index e8654c68..3ad1a4f4 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -47,8 +47,9 @@ def test_annotate( == "SEQ=LESLIEK,PEPTIDEK,decoy_KEILSEL,decoy_KEDITEPP" ) assert ( - seq_lines[2].strip() - == "SEQ=+42.011LEM+15.995SLIM+15.995EK,+43.006PEN+0.984PTIQ+0.984DEK,decoy_-17.027KM+15.995EILSEL,decoy_+43.006-17.027KEDITEPP,decoy_KEDIQ+0.984TEPPQ+0.984" + seq_lines[2].strip() == "SEQ=+42.011LEM+15.995SLIM+15.995EK," + "+43.006PEN+0.984PTIQ+0.984DEK,decoy_-17.027KM+15.995EILSEL," + "decoy_+43.006-17.027KEDITEPP,decoy_KEDIQ+0.984TEPPQ+0.984" ) @@ -56,7 +57,7 @@ def test_db_search( mgf_small_unannotated, tide_dir_small, tiny_config, tmp_path, monkeypatch ): # Run a command: - monkeypatch.setattr(casanovo, "__version__", "4.1.1") + monkeypatch.setattr(casanovo, "__version__", "4.1.0") run = functools.partial( CliRunner().invoke, casanovo.main, catch_exceptions=False ) From ee6638e70c30e27727f7b409d70fa3672ca2ee11 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 7 May 2024 06:30:44 +0000 Subject: [PATCH 13/84] Generate new screengrabs with rich-codex --- docs/images/configure-help.svg | 64 ++++++------ docs/images/evaluate-help.svg | 123 +++++++++++------------ docs/images/help.svg | 169 +++++++++++++++---------------- docs/images/sequence-help.svg | 123 +++++++++++------------ docs/images/train-help.svg | 175 ++++++++++++++++----------------- 5 files changed, 307 insertions(+), 347 
 deletions(-)

diff --git a/docs/images/configure-help.svg b/docs/images/configure-help.svg
index d5dd7aa8..0822927a 100644
diff --git a/docs/images/evaluate-help.svg b/docs/images/evaluate-help.svg
index e220664b..b16c4ffd 100644
diff --git a/docs/images/help.svg b/docs/images/help.svg
index 533b5f70..67dca83e 100644
diff --git a/docs/images/sequence-help.svg b/docs/images/sequence-help.svg
index d493e2b2..f5799766 100644
diff --git a/docs/images/train-help.svg b/docs/images/train-help.svg
index 82c30122..fccd4140 100644
[SVG terminal screengrabs: the regenerated configure, evaluate, main, sequence, and train --help screens. The main help screen lists the annotate, configure, db-search, evaluate, sequence, train, and version commands; the tail of the regenerated train-help.svg text follows.]
│ +│    --config               -c  FILE                    The YAML configuration │ +│                                                       file overriding the    │ +│                                                       default options.       │ +│    --verbosity            -v  [debug|info|warning|er  Set the verbosity of   │ +│                               ror]                    console logging        │ +│                                                       messages. Log files    │ +│                                                       are always set to      │ +│                                                       'debug'.               │ +│    --help                 -h                          Show this message and  │ +│                                                       exit.                  │ +╰──────────────────────────────────────────────────────────────────────────────╯ + From 2d57513fbcfed50e848ca47b9f82ac68baa884a2 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Tue, 7 May 2024 12:05:34 -0700 Subject: [PATCH 14/84] justins requested fixes --- casanovo/config.yaml | 1 - casanovo/data/annotate_db.py | 15 +++++++++++---- casanovo/denovo/dataloaders.py | 2 +- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/casanovo/config.yaml b/casanovo/config.yaml index 7b207664..a92a7ffa 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -16,7 +16,6 @@ isotope_error_range: [0, 1] # The minimum length of predicted peptides. min_peptide_len: 6 # Number of spectra in one inference batch. -# Also the number of PSMs processed by Casanovo-DB per batch. predict_batch_size: 1024 # Number of beams used in beam search. n_beams: 1 diff --git a/casanovo/data/annotate_db.py b/casanovo/data/annotate_db.py index 4f33b798..dd2e6c64 100644 --- a/casanovo/data/annotate_db.py +++ b/casanovo/data/annotate_db.py @@ -38,8 +38,10 @@ def _normalize_mods(seq: str) -> str: def annotate_mgf(peak_path: str, tide_path: str, output: Optional[str]): """ - Accepts a directory containing the results of a successful tide search, and an .mgf file containing MS/MS spectra. - The .mgf file is then annotated in the SEQ field with all of the candidate peptides for each spectrum, as well as their target/decoy status. + Accepts a directory containing the results of a successful tide search, + and an .mgf file containing MS/MS spectra. + The .mgf file is then annotated in the SEQ field with + all of the candidate peptides for each spectrum, as well as their target/decoy status. This annotated .mgf can be given directly to Casanovo-DB to perfrom a database search. Parameters @@ -104,7 +106,11 @@ def annotate_mgf(peak_path: str, tide_path: str, output: Optional[str]): decoy_candidate_list = [] logger.warn(f"No decoy peptides found for scan {scan}.") - scan_map[scan] = target_candidate_list + decoy_candidate_list + pep_list = target_candidate_list + decoy_candidate_list + if len(pep_list) == 0: + logger.warn(f"No peptides found for scan {scan}.") + else: + scan_map[scan] = target_candidate_list + decoy_candidate_list all_spec = [] for idx, spec_dict in enumerate(mgf.read(peak_path)): @@ -112,7 +118,8 @@ def annotate_mgf(peak_path: str, tide_path: str, output: Optional[str]): scan = int(spec_dict["params"]["scans"]) except KeyError as e: logger.error( - "Could not find the scan number in the .mgf file. Please ensure that the .mgf file contains the scan number in the 'SCANS' field." + "Could not find the scan number in the .mgf file." 
+ "Please ensure that the .mgf file contains the scan number in the 'SCANS' field." ) raise e try: diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index efb346ab..284aaeb7 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -126,7 +126,7 @@ def setup(self, stage: str = None, annotated: bool = True) -> None: ) if self.test_index is not None: self.test_dataset = make_dataset(self.test_index) - if stage in (None, "db"): + if stage == "db": make_dataset = functools.partial( DBSpectrumDataset, n_peaks=self.n_peaks, From 3cfb7954a20cf2dd8ba79245db3291b1e2eaab3b Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Mon, 17 Jun 2024 12:11:24 -0700 Subject: [PATCH 15/84] added minor changes as requested by Wout --- casanovo/casanovo.py | 34 +++++++++++++++++------ casanovo/config.yaml | 2 +- casanovo/data/datasets.py | 5 ++-- casanovo/data/ms_io.py | 4 +-- casanovo/denovo/dataloaders.py | 4 +-- casanovo/denovo/model.py | 48 ++++++++++++++++----------------- casanovo/denovo/model_runner.py | 6 ++--- tests/test_integration.py | 8 +----- 8 files changed, 61 insertions(+), 50 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index c2bce3ef..7db5faa8 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -146,7 +146,7 @@ def sequence( logger.info("DONE!") -@main.command(cls=_SharedParams) +@main.command() @click.argument( "peak_path", required=True, @@ -159,11 +159,28 @@ def sequence( nargs=1, type=click.Path(exists=True, dir_okay=True), ) +@click.option( + "-o", + "--output", + help="The output annotated MGF file.", + type=click.Path(dir_okay=False), +) +@click.option( + "-v", + "--verbosity", + help=""" + Set the verbosity of console logging messages. Log files are + always set to 'debug'. + """, + type=click.Choice( + ["debug", "info", "warning", "error"], + case_sensitive=False, + ), + default="info", +) def annotate( peak_path: str, tide_path: str, - model: Optional[str], - config: Optional[str], output: Optional[str], verbosity: str, ) -> None: @@ -174,13 +191,12 @@ def annotate( TIDE_PATH must be one directory containing the Tide search results of the .mgf. This directory must contain tide-search.decoy.txt and tide-search.target.txt """ - for peak_file in peak_path: - logger.info(" %s", peak_file) - if output is None: output = setup_logging(output, verbosity) logger.info( - "Output file not specified. Annotated MGF will be saved in the same directory as the input MGF." + "Output file not specified. \ + Annotated MGF will be saved in the same directory \ + as the input MGF." ) output = peak_path.replace(".mgf", "_annotated.mgf") else: @@ -207,11 +223,13 @@ def db_search( ) -> None: """Perform a search using Casanovo-DB. - PEAK_PATH must be one MGF file that has ANNOTATED spectra, as output by annotate mode. + PEAK_PATH must be one MGF file that has ANNOTATED spectra, + as output by annotate mode. """ output = setup_logging(output, verbosity) config, model = setup_model(model, config, output, False) with ModelRunner(config, model) as runner: + logger.info("DB-searching peptides from: %s", peak_path) runner.db_search(peak_path, output) logger.info("DONE!") diff --git a/casanovo/config.yaml b/casanovo/config.yaml index a92a7ffa..c7186ff7 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -15,7 +15,7 @@ precursor_mass_tol: 50 # ppm isotope_error_range: [0, 1] # The minimum length of predicted peptides. min_peptide_len: 6 -# Number of spectra in one inference batch. 
+# Number of spectra in one inference batch. predict_batch_size: 1024 # Number of beams used in beam search. n_beams: 1 diff --git a/casanovo/data/datasets.py b/casanovo/data/datasets.py index d0c6f347..665d69e0 100644 --- a/casanovo/data/datasets.py +++ b/casanovo/data/datasets.py @@ -267,9 +267,10 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, float, int, str]: return spectrum, precursor_mz, precursor_charge, peptide -class DBSpectrumDataset(AnnotatedSpectrumDataset): +class DbSpectrumDataset(AnnotatedSpectrumDataset): """ - Parse and retrieve collections of annotated MS/MS spectra, additionally keep track of spectrum ids for Casanovo-DB. + Parse and retrieve collections of annotated MS/MS spectra, + additionally keep track of spectrum ids for Casanovo-DB. Parameters ---------- diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py index ae4f3b54..de12e768 100644 --- a/casanovo/data/ms_io.py +++ b/casanovo/data/ms_io.py @@ -254,7 +254,7 @@ def save(self) -> None: "start", "end", "opt_ms_run[1]_aa_scores", - "opt_target", + "opt_cv_MS:1002217_decoy_peptide", ] ) for i, psm in enumerate(self.psms): @@ -287,6 +287,6 @@ def save(self) -> None: ) ) ), # opt_ms_run[1]_aa_scores - bool(psm[7]), # opt_target + bool(psm[7]), # opt_cv_MS:1002217_decoy_peptide ] ) diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 284aaeb7..f9865572 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -12,7 +12,7 @@ from ..data.datasets import ( AnnotatedSpectrumDataset, SpectrumDataset, - DBSpectrumDataset, + DbSpectrumDataset, ) @@ -128,7 +128,7 @@ def setup(self, stage: str = None, annotated: bool = True) -> None: self.test_dataset = make_dataset(self.test_index) if stage == "db": make_dataset = functools.partial( - DBSpectrumDataset, + DbSpectrumDataset, n_peaks=self.n_peaks, min_mz=self.min_mz, max_mz=self.max_mz, diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index ec234691..6c440ce0 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -989,17 +989,16 @@ def configure_optimizers( return [optimizer], {"scheduler": lr_scheduler, "interval": "step"} -class DBSpec2Pep(Spec2Pep): +class DbSpec2Pep(Spec2Pep): """ Inherits Spec2Pep Hijacks teacher-forcing implemented in Spec2Pep and uses it to predict scores between a spectra and associated peptide. - Input format is .mgf, with comma-separated targets - and decoys in the SEQ field. Decoys should have a prefix of "decoy_". + Decoys should have a prefix of "decoy_". """ - num_pairs = None # Modified to be predict_batch_size from config + num_pairs = None def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -1014,11 +1013,8 @@ def predict_step(self, batch, *args): encoded_ms, ) in self.smart_batch_gen(batch): pred, truth = self.decoder(peptides, precursors, *encoded_ms) - sm = torch.nn.Softmax(dim=2) - pred = sm(pred) - score_result, per_aa_score = _calc_match_score( - pred, truth - ) # Calculate the score between spectra + peptide list + pred = self.softmax(pred) + score_result, per_aa_score = _calc_match_score(pred, truth) batch_res.append( ( indexes, @@ -1122,29 +1118,31 @@ def _calc_match_score( batch_all_aa_scores: torch.Tensor, truth_aa_indicies: torch.Tensor ) -> List[float]: """ + Calculate the score between the input spectra and associated peptide. + Take in teacher-forced scoring of amino acids of the peptides (in a batch) and use the truth labels to calculate a score between the input spectra and associated peptide. 
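A minimal numerical sketch of the scoring described here (illustrative values only, not taken from the patch): the value reported per peptide-spectrum match is the mean of the log-transformed amino acid probabilities, which equals the log of their geometric mean.

import numpy as np

# Hypothetical softmax scores gathered at the ground-truth amino acid
# indices of a single peptide-spectrum match.
per_aa_probs = np.array([0.9, 0.8, 0.95, 0.7])
mean_log_prob = np.log(per_aa_probs).mean()
geometric_mean = per_aa_probs.prod() ** (1 / per_aa_probs.size)
assert np.isclose(mean_log_prob, np.log(geometric_mean))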
The score is the geometric mean of the AA probabilities - Parameters - ---------- - batch_all_aa_scores : torch.Tensor - Amino acid scores for all amino acids in - the vocabulary for every prediction made to generate - the associated peptide (for an entire batch) - truth_aa_indicies : torch.Tensor - Indicies of the score for each actual amino acid - in the peptide (for an entire batch) + Parameters + ---------- + batch_all_aa_scores : torch.Tensor + Amino acid scores for all amino acids in + the vocabulary for every prediction made to generate + the associated peptide (for an entire batch) + truth_aa_indicies : torch.Tensor + Indicies of the score for each actual amino acid + in the peptide (for an entire batch) - Returns - ------- - score : list[float], list[list[float]] - The score between the input spectra and associated peptide - (for an entire batch) - a list of lists of per amino acid scores - (for an entire batch) + Returns + ------- + score : list[float], list[list[float]] + The score between the input spectra and associated peptide + (for an entire batch) + a list of lists of per amino acid scores + (for an entire batch) """ # Remove trailing tokens from predictions, batch_all_aa_scores = batch_all_aa_scores[:, :-1] diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index e6956049..fd17378f 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -20,7 +20,7 @@ from ..config import Config from ..data import ms_io from ..denovo.dataloaders import DeNovoDataModule -from ..denovo.model import Spec2Pep, DBSpec2Pep +from ..denovo.model import Spec2Pep, DbSpec2Pep logger = logging.getLogger("casanovo") @@ -294,7 +294,7 @@ def initialize_db_model(self) -> None: # the provided configuration. device = torch.empty(1).device # Use the default device. try: - self.model = DBSpec2Pep.load_from_checkpoint( + self.model = DbSpec2Pep.load_from_checkpoint( self.model_filename, map_location=device, **loaded_model_params ) @@ -315,7 +315,7 @@ def initialize_db_model(self) -> None: except RuntimeError: # This only doesn't work if the weights are from an older version try: - self.model = DBSpec2Pep.load_from_checkpoint( + self.model = DbSpec2Pep.load_from_checkpoint( self.model_filename, map_location=device, **model_params, diff --git a/tests/test_integration.py b/tests/test_integration.py index 3ad1a4f4..aacc6d15 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -7,9 +7,7 @@ from casanovo import casanovo -def test_annotate( - mgf_small_unannotated, tide_dir_small, tiny_config, tmp_path -): +def test_annotate(mgf_small_unannotated, tide_dir_small, tmp_path): # Run a command: run = functools.partial( @@ -20,8 +18,6 @@ def test_annotate( "annotate", str(mgf_small_unannotated), str(tide_dir_small), - "--config", - tiny_config, "--output", str(tmp_path / "annotated_mgf.mgf"), ] @@ -66,8 +62,6 @@ def test_db_search( "annotate", str(mgf_small_unannotated), str(tide_dir_small), - "--config", - tiny_config, "--output", str(tmp_path / "annotated_mgf.mgf"), ] From 49f44ada6452a2769d0e8bddd04f03995f7a8a1c Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Mon, 17 Jun 2024 17:15:14 -0700 Subject: [PATCH 16/84] partial fixes requested by wout. 
Lots of subclassing removed --- casanovo/data/datasets.py | 84 +++--------------- casanovo/data/ms_io.py | 27 +++--- casanovo/denovo/dataloaders.py | 42 ++------- casanovo/denovo/model.py | 12 +-- casanovo/denovo/model_runner.py | 152 +++++++++----------------------- tests/test_integration.py | 2 +- tests/unit_tests/test_unit.py | 46 ++++++++-- 7 files changed, 121 insertions(+), 244 deletions(-) diff --git a/casanovo/data/datasets.py b/casanovo/data/datasets.py index 665d69e0..aff6af85 100644 --- a/casanovo/data/datasets.py +++ b/casanovo/data/datasets.py @@ -212,6 +212,8 @@ class AnnotatedSpectrumDataset(SpectrumDataset): random_state : Optional[int] The NumPy random state. ``None`` leaves mass spectra in the order they were parsed. + track_spectrum_id : Optional[bool] + Whether to keep track of the identifier of the MS/MS spectra. """ def __init__( @@ -223,6 +225,7 @@ def __init__( min_intensity: float = 0.01, remove_precursor_tol: float = 2.0, random_state: Optional[int] = None, + track_spectrum_id: Optional[bool] = False, ): super().__init__( annotated_spectrum_index, @@ -233,6 +236,7 @@ def __init__( remove_precursor_tol=remove_precursor_tol, random_state=random_state, ) + self.track_spectrum_id = track_spectrum_id def __getitem__(self, idx: int) -> Tuple[torch.Tensor, float, int, str]: """ @@ -264,76 +268,12 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, float, int, str]: spectrum = self._process_peaks( mz_array, int_array, precursor_mz, precursor_charge ) + if self.track_spectrum_id: + return ( + spectrum, + precursor_mz, + precursor_charge, + peptide, + self.get_spectrum_id(idx), + ) return spectrum, precursor_mz, precursor_charge, peptide - - -class DbSpectrumDataset(AnnotatedSpectrumDataset): - """ - Parse and retrieve collections of annotated MS/MS spectra, - additionally keep track of spectrum ids for Casanovo-DB. - - Parameters - ---------- - annotated_spectrum_index : depthcharge.data.SpectrumIndex - The MS/MS spectra to use as a dataset. - n_peaks : Optional[int] - The number of top-n most intense peaks to keep in each spectrum. `None` - retains all peaks. - min_mz : float - The minimum m/z to include. The default is 140 m/z, in order to exclude - TMT and iTRAQ reporter ions. - max_mz : float - The maximum m/z to include. - min_intensity : float - Remove peaks whose intensity is below `min_intensity` percentage of the - base peak intensity. - remove_precursor_tol : float - Remove peaks within the given mass tolerance in Dalton around the - precursor mass. - random_state : Optional[int] - The NumPy random state. ``None`` leaves mass spectra in the order they - were parsed. - """ - - def __getitem__( - self, idx: int - ) -> Tuple[torch.Tensor, float, int, str, Tuple[str, str]]: - """ - Return the annotated MS/MS spectrum with the given index. - - Parameters - ---------- - idx : int - The index of the spectrum to return. - - Returns - ------- - spectrum : torch.Tensor of shape (n_peaks, 2) - A tensor of the spectrum with the m/z and intensity peak values. - precursor_mz : float - The precursor m/z. - precursor_charge : int - The precursor charge. - annotation : str - The peptide annotation of the spectrum. - spectrum_id: Tuple[str, str] - The unique spectrum identifier, formed by its original peak file and - identifier (index or scan number) therein. 
- """ - ( - mz_array, - int_array, - precursor_mz, - precursor_charge, - peptide, - ) = self.index[idx] - spectrum = self._process_peaks( - mz_array, int_array, precursor_mz, precursor_charge - ) - return ( - spectrum, - precursor_mz, - precursor_charge, - peptide, - self.get_spectrum_id(idx), - ) diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py index de12e768..c4cfc7cb 100644 --- a/casanovo/data/ms_io.py +++ b/casanovo/data/ms_io.py @@ -22,10 +22,13 @@ class MztabWriter: ---------- filename : str The name of the mzTab file. + is_db_variant : bool + Whether the mzTab file is for a Casanovo-DB search. """ - def __init__(self, filename: str): + def __init__(self, filename: str, is_db_variant: bool = False): self.filename = filename + self.is_db_variant = is_db_variant self.metadata = [ ("mzTab-version", "1.0.0"), ("mzTab-mode", "Summary"), @@ -147,6 +150,9 @@ def save(self) -> None: """ Export the spectrum identifications to the mzTab file. """ + if self.is_db_variant: + self.save_db_variant() + return with open(self.filename, "w", newline="") as f: writer = csv.writer(f, delimiter="\t", lineterminator=os.linesep) # Write metadata. @@ -210,21 +216,12 @@ def save(self) -> None: ] ) - -class DBWriter(MztabWriter): - """ - Export DB search results to an mzTab file. - - Parameters - ---------- - filename : str - The name of the mzTab file. - """ - - def save(self) -> None: + def save_db_variant(self) -> None: """ - Export the DB search results to the mzTab file. - Outputs PSMs in the order they were scored (i.e. the order in the annotated .mgf file). + Export the Casanovo-DB search results to the mzTab file. + + Outputs PSMs in the order they were scored + (i.e. the order in the annotated .mgf file). """ with open(self.filename, "w", newline="") as f: writer = csv.writer(f, delimiter="\t", lineterminator=os.linesep) diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index f9865572..6731e532 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -12,7 +12,6 @@ from ..data.datasets import ( AnnotatedSpectrumDataset, SpectrumDataset, - DbSpectrumDataset, ) @@ -128,12 +127,13 @@ def setup(self, stage: str = None, annotated: bool = True) -> None: self.test_dataset = make_dataset(self.test_index) if stage == "db": make_dataset = functools.partial( - DbSpectrumDataset, + AnnotatedSpectrumDataset, n_peaks=self.n_peaks, min_mz=self.min_mz, max_mz=self.max_mz, min_intensity=self.min_intensity, remove_precursor_tol=self.remove_precursor_tol, + track_spectrum_id=True, ) if self.test_index is not None: self.test_dataset = make_dataset(self.test_index) @@ -143,6 +143,7 @@ def _make_loader( dataset: torch.utils.data.Dataset, batch_size: int, shuffle: bool = False, + db_mode: bool = False, ) -> torch.utils.data.DataLoader: """ Create a PyTorch DataLoader. @@ -155,6 +156,8 @@ def _make_loader( The batch size to use. shuffle : bool Option to shuffle the batches. + db_mode : bool + Option to use the DataLoader for Casanovo-DB. Returns ------- @@ -164,41 +167,12 @@ def _make_loader( return torch.utils.data.DataLoader( dataset, batch_size=batch_size, - collate_fn=prepare_batch, + collate_fn=prepare_batch if not db_mode else prepare_db_batch, pin_memory=True, num_workers=self.n_workers, shuffle=shuffle, ) - def _make_db_loader( - self, dataset: torch.utils.data.Dataset, batch_size: int - ) -> torch.utils.data.DataLoader: - """ - Create a PyTorch DataLoader. - - Parameters - ---------- - dataset : torch.utils.data.Dataset - A PyTorch Dataset. 
- - Returns - ------- - torch.utils.data.DataLoader - A PyTorch DataLoader. - """ - # Calculate new batch size to saturate previous batch size with PSMs - pep_per_spec = [] - for i in range(min(10, len(dataset))): - pep_per_spec.append(len(dataset[i][3].split(","))) - new_batch_size = max(1, int(batch_size // np.mean(pep_per_spec))) - return torch.utils.data.DataLoader( - dataset, - batch_size=new_batch_size, - collate_fn=prepare_db_batch, - pin_memory=True, - num_workers=self.n_workers, - ) - def train_dataloader(self) -> torch.utils.data.DataLoader: """Get the training DataLoader.""" return self._make_loader( @@ -219,7 +193,9 @@ def predict_dataloader(self) -> torch.utils.data.DataLoader: def db_dataloader(self) -> torch.utils.data.DataLoader: """Get the predict DataLoader.""" - return self._make_db_loader(self.test_dataset, self.eval_batch_size) + return self._make_loader( + self.test_dataset, self.eval_batch_size, db_mode=True + ) def prepare_batch( diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 6c440ce0..31d90e24 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1157,18 +1157,18 @@ def _calc_match_score( per_aa_scores = batch_all_aa_scores[rows, cols, truth_aa_indicies] + per_aa_scores[per_aa_scores == 0] += 1e-10 score_mask = truth_aa_indicies != 0 - masked_per_aa_scores = per_aa_scores * score_mask - # Arithmetic score that was used before - ## all_scores = masked_per_aa_scores.sum(dim=1) / score_mask.sum(dim=1) + per_aa_scores[~score_mask] = 0 + log_per_aa_scores = torch.log(per_aa_scores) all_scores = torch.where( - torch.log(masked_per_aa_scores) == float("-inf"), + log_per_aa_scores == float("-inf"), torch.tensor(0.0), - torch.log(masked_per_aa_scores), + log_per_aa_scores, ).sum(dim=1) / score_mask.sum( dim=1 ) # Calculates geometric score - return all_scores, masked_per_aa_scores + return all_scores, per_aa_scores class CosineWarmupScheduler(torch.optim.lr_scheduler._LRScheduler): diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index fd17378f..14aebf8d 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -93,7 +93,9 @@ def db_search(self, peak_path: Iterable[str], output: str) -> None: ------- self """ - self.writer = ms_io.DBWriter(Path(output).with_suffix(".mztab")) + self.writer = ms_io.MztabWriter( + Path(output).with_suffix(".mztab"), is_db_variant=True + ) self.writer.set_metadata( self.config, model=str(self.model_filename), @@ -101,7 +103,7 @@ def db_search(self, peak_path: Iterable[str], output: str) -> None: ) self.initialize_trainer(train=True) - self.initialize_db_model() + self.initialize_model(train=False, db_search=True) self.model.out_writer = self.writer test_index = self._get_index(peak_path, True, "db search") @@ -229,106 +231,9 @@ def initialize_trainer(self, train: bool) -> None: self.trainer = pl.Trainer(**trainer_cfg) - def initialize_db_model(self) -> None: - """Initialize the Casanovo-DB model. - Required because the DB search model is a unique subclass of the Spec2Pep model. 
- """ - model_params = dict( - dim_model=self.config.dim_model, - n_head=self.config.n_head, - dim_feedforward=self.config.dim_feedforward, - n_layers=self.config.n_layers, - dropout=self.config.dropout, - dim_intensity=self.config.dim_intensity, - max_length=self.config.max_length, - residues=self.config.residues, - max_charge=self.config.max_charge, - precursor_mass_tol=self.config.precursor_mass_tol, - isotope_error_range=self.config.isotope_error_range, - min_peptide_len=self.config.min_peptide_len, - n_beams=self.config.n_beams, - top_match=self.config.top_match, - n_log=self.config.n_log, - tb_summarywriter=self.config.tb_summarywriter, - train_label_smoothing=self.config.train_label_smoothing, - warmup_iters=self.config.warmup_iters, - cosine_schedule_period_iters=self.config.cosine_schedule_period_iters, - lr=self.config.learning_rate, - weight_decay=self.config.weight_decay, - out_writer=self.writer, - calculate_precision=self.config.calculate_precision, - ) - - # Reconfigurable non-architecture related parameters for a loaded model. - loaded_model_params = dict( - max_length=self.config.max_length, - precursor_mass_tol=self.config.precursor_mass_tol, - isotope_error_range=self.config.isotope_error_range, - n_beams=self.config.n_beams, - min_peptide_len=self.config.min_peptide_len, - top_match=self.config.top_match, - n_log=self.config.n_log, - tb_summarywriter=self.config.tb_summarywriter, - train_label_smoothing=self.config.train_label_smoothing, - warmup_iters=self.config.warmup_iters, - cosine_schedule_period_iters=self.config.cosine_schedule_period_iters, - lr=self.config.learning_rate, - weight_decay=self.config.weight_decay, - out_writer=self.writer, - calculate_precision=self.config.calculate_precision, - ) - - # Model file must exist for DB search - if self.model_filename is None: - logger.error("A model file must be provided") - raise ValueError("A model file must be provided") - - if not Path(self.model_filename).exists(): - logger.error( - "Could not find the model weights at file %s", - self.model_filename, - ) - raise FileNotFoundError("Could not find the model weights file") - - # First try loading model details from the weights file, otherwise use - # the provided configuration. - device = torch.empty(1).device # Use the default device. - try: - self.model = DbSpec2Pep.load_from_checkpoint( - self.model_filename, map_location=device, **loaded_model_params - ) - - # Pass in information about predict_batch_size to the model for batch saturation - self.model.num_pairs = self.config.predict_batch_size - - architecture_params = set(model_params.keys()) - set( - loaded_model_params.keys() - ) - for param in architecture_params: - if model_params[param] != self.model.hparams[param]: - warnings.warn( - f"Mismatching {param} parameter in " - f"model checkpoint ({self.model.hparams[param]}) " - f"vs config file ({model_params[param]}); " - "using the checkpoint." - ) - except RuntimeError: - # This only doesn't work if the weights are from an older version - try: - self.model = DbSpec2Pep.load_from_checkpoint( - self.model_filename, - map_location=device, - **model_params, - ) - # Pass in information about predict_batch_size to the model for batch saturation - self.model.num_pairs = self.config.predict_batch_size - except RuntimeError: - raise RuntimeError( - "Weights file incompatible with the current version of " - "Casanovo." 
- ) - - def initialize_model(self, train: bool) -> None: + def initialize_model( + self, train: bool, db_search: Optional[bool] = False + ) -> None: """Initialize the Casanovo model. Parameters @@ -336,6 +241,8 @@ def initialize_model(self, train: bool) -> None: train : bool Determines whether to set the model up for model training or evaluation / inference. + db_search : Optional[bool] + Determines whether to use the DB search model subclass. """ model_params = dict( dim_model=self.config.dim_model, @@ -385,6 +292,11 @@ def initialize_model(self, train: bool) -> None: if self.model_filename is None: # Train a model from scratch if no model file is provided. if train: + if db_search: + logger.error("Db search mode requires a model file.") + raise ValueError( + "A model file must be provided for DB search mode" + ) self.model = Spec2Pep(**model_params) return # Else we're not training, so a model file must be provided. @@ -404,9 +316,20 @@ def initialize_model(self, train: bool) -> None: # the provided configuration. device = torch.empty(1).device # Use the default device. try: - self.model = Spec2Pep.load_from_checkpoint( - self.model_filename, map_location=device, **loaded_model_params - ) + if db_search: + self.model = DbSpec2Pep.load_from_checkpoint( + self.model_filename, + map_location=device, + **loaded_model_params, + ) + ## TODO move? + self.model.num_pairs = self.config.predict_batch_size + else: + self.model = Spec2Pep.load_from_checkpoint( + self.model_filename, + map_location=device, + **loaded_model_params, + ) architecture_params = set(model_params.keys()) - set( loaded_model_params.keys() @@ -422,11 +345,20 @@ def initialize_model(self, train: bool) -> None: except RuntimeError: # This only doesn't work if the weights are from an older version try: - self.model = Spec2Pep.load_from_checkpoint( - self.model_filename, - map_location=device, - **model_params, - ) + if db_search: + self.model = DbSpec2Pep.load_from_checkpoint( + self.model_filename, + map_location=device, + **model_params, + ) + ## TODO move? 
+ self.model.num_pairs = self.config.predict_batch_size + else: + self.model = Spec2Pep.load_from_checkpoint( + self.model_filename, + map_location=device, + **model_params, + ) except RuntimeError: raise RuntimeError( "Weights file incompatible with the current version of " diff --git a/tests/test_integration.py b/tests/test_integration.py index aacc6d15..60e3977b 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -108,7 +108,7 @@ def test_db_search( "+43.006-17.027KEDITEPP", "KEDIQ+0.984TEPPQ+0.984", ] - assert list(psms.opt_target) == [ + assert list(psms["opt_cv_MS:1002217_decoy_peptide"]) == [ "True", "True", "False", diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index bcc61446..ec9085c0 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -153,6 +153,11 @@ def test_calc_match_score(): stop_slot_prob = torch.zeros(29) stop_slot_prob[28] = 1.0 # $ blank_slot_prob = torch.zeros(29) + blank_slot_prob[0] = 0.42 # Should never come into play + fourth_slot_prob = torch.zeros(29) + fourth_slot_prob[4] = 0.5 # D + fifth_slot_prob = torch.zeros(29) + fifth_slot_prob[5] = 0.5 # E pep_1_aa = torch.stack( [ @@ -172,19 +177,46 @@ def test_calc_match_score(): blank_slot_prob, ] ) - - batch_all_aa_scores = torch.stack([pep_1_aa, pep_2_aa]) - truth_aa_indices = torch.tensor([[1, 2, 3, 28], [3, 2, 28, 0]]) + pep_3_aa = torch.stack( + [ + fourth_slot_prob, + fifth_slot_prob, + first_slot_prob, + stop_slot_prob, + blank_slot_prob, + ] + ) + pep_4_aa = torch.stack( + [ + first_slot_prob, + second_slot_prob, + third_slot_prob, + stop_slot_prob, + blank_slot_prob, + ] + ) + batch_all_aa_scores = torch.stack([pep_1_aa, pep_2_aa, pep_3_aa, pep_4_aa]) + truth_aa_indices = torch.tensor( + [[1, 2, 3, 28], [3, 2, 28, 0], [4, 5, 1, 28], [2, 2, 3, 28]] + ) all_scores, masked_per_aa_scores = _calc_match_score( batch_all_aa_scores, truth_aa_indices ) - assert all_scores.numpy()[0] == pytest.approx(0) - assert all_scores.numpy()[1] == pytest.approx(0) + assert all_scores.numpy()[0] == 0 + assert all_scores.numpy()[1] == 0 + assert all_scores.numpy()[2] == pytest.approx( + np.log(0.5 * 0.5 * 1 * 1) / 4 + ) + assert all_scores.numpy()[3] == pytest.approx( + np.log(1e-10 * 1 * 1 * 1) / 4 + ) - assert np.sum(masked_per_aa_scores.numpy()[0]) == pytest.approx(4) - assert np.sum(masked_per_aa_scores.numpy()[1]) == pytest.approx(3) + assert np.sum(masked_per_aa_scores.numpy()[0]) == 4 + assert np.sum(masked_per_aa_scores.numpy()[1]) == 3 + assert np.sum(masked_per_aa_scores.numpy()[2]) == 3 + assert np.sum(masked_per_aa_scores.numpy()[3]) == 3 def test_beam_search_decode(): From d967c4218bf6f84d25e87e874c807e11c958cfbb Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Tue, 18 Jun 2024 12:25:09 -0700 Subject: [PATCH 17/84] documentation fixes and starting to cleanup batching code --- casanovo/denovo/dataloaders.py | 9 +++--- casanovo/denovo/model.py | 53 +++++++++++++++++++++++---------- casanovo/denovo/model_runner.py | 4 --- 3 files changed, 43 insertions(+), 23 deletions(-) diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 6731e532..aff860a1 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -245,11 +245,12 @@ def prepare_db_batch( Parameters ---------- - batch : List[Tuple[torch.Tensor, float, int, str, Tuple[str, str]]] + batch : List[Tuple[torch.Tensor, Tuple[float, int, float], str, Tuple[str, str]]] A batch of data from an AnnotatedSpectrumDataset, consisting of for each 
- spectrum (i) a tensor with the m/z and intensity peak values, (ii), the - precursor m/z, (iii) the precursor charge, (iv) the spectrum identifier (peptide), (v) - spectrum identifiers (file and scan). + spectrum (i) a tensor with the m/z and intensity peak values, + (ii) the precursor information [mass, charge, m/z], (iii) the + peptide sequence, the precursor m/z, (iv) spectrum identifiers + (file and scan). Returns ------- diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 31d90e24..9dcb3e7e 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -995,30 +995,45 @@ class DbSpec2Pep(Spec2Pep): Hijacks teacher-forcing implemented in Spec2Pep and uses it to predict scores between a spectra and associated peptide. - Decoys should have a prefix of "decoy_". """ - num_pairs = None - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def predict_step(self, batch, *args): + """ + A single prediction step for Casanovo-DB + + Parameters + ---------- + batch : Tuple[torch.Tensor, torch.Tensor, torch.Tensor] + A batch of (i) MS/MS spectra, (ii) precursor information, (iii) + spectrum identifiers as torch Tensors, (iv) scan numbers. + + Returns + ------- + predictions: List[Tuple[int, bool, str, float, np.ndarray, np.ndarray]] + Model predictions for the given batch of spectra containing spectrum + scan number, decoy flag, peptide sequence, Casanovo-DB score, + amino acid-level confidence scores, and precursor information. + """ batch_res = [] for ( indexes, - t_or_d, + is_decoy, peptides, precursors, encoded_ms, ) in self.smart_batch_gen(batch): pred, truth = self.decoder(peptides, precursors, *encoded_ms) pred = self.softmax(pred) - score_result, per_aa_score = _calc_match_score(pred, truth) + score_result, per_aa_score = _calc_match_score( + pred, truth, self.decoder.reverse + ) batch_res.append( ( indexes, - t_or_d, + is_decoy, peptides, score_result.cpu().detach().numpy(), per_aa_score.cpu().detach().numpy(), @@ -1028,6 +1043,7 @@ def predict_step(self, batch, *args): return batch_res def smart_batch_gen(self, batch): + batch_size = len(batch[0]) all_psm = [] enc = self.encoder(batch[0]) precursors = batch[1] @@ -1037,7 +1053,7 @@ def smart_batch_gen(self, batch): spec_peptides = batch[2][idx].split(",") # Check for decoy prefixes and create a bit-vector indicating targets (1) or decoys (0) decoy_prefix = "decoy_" # Decoy prefix - t_or_ds = [ + decoy_mask = [ 0 if p.startswith(decoy_prefix) else 1 for p in spec_peptides ] # Remove decoy prefix @@ -1055,14 +1071,14 @@ def smart_batch_gen(self, batch): spec_precursors, spec_peptides, spec_idx, - t_or_ds, + decoy_mask, ) ) ) # Continually grab num_pairs items from all_psm until list is exhausted while len(all_psm) > 0: - batch = all_psm[: self.num_pairs] - all_psm = all_psm[self.num_pairs :] + batch = all_psm[:batch_size] + all_psm = all_psm[batch_size:] batch = list(zip(*batch)) encoded_ms = ( torch.stack([a[0] for a in batch[0]]), @@ -1071,8 +1087,8 @@ def smart_batch_gen(self, batch): prec_data = torch.stack(batch[1]) pep_str = list(batch[2]) indexes = [a[1] for a in batch[3]] - t_or_ds = batch[4] - yield (indexes, t_or_ds, pep_str, prec_data, encoded_ms) + is_decoy = batch[4] + yield (indexes, is_decoy, pep_str, prec_data, encoded_ms) def on_predict_batch_end( self, @@ -1115,7 +1131,9 @@ def on_predict_batch_end( def _calc_match_score( - batch_all_aa_scores: torch.Tensor, truth_aa_indicies: torch.Tensor + batch_all_aa_scores: torch.Tensor, + truth_aa_indicies: torch.Tensor, + 
decoder_reverse: bool = False, ) -> List[float]: """ Calculate the score between the input spectra and associated peptide. @@ -1135,6 +1153,8 @@ def _calc_match_score( truth_aa_indicies : torch.Tensor Indicies of the score for each actual amino acid in the peptide (for an entire batch) + decoder_reverse : bool + Whether the decoder is reversed. Returns ------- @@ -1144,8 +1164,11 @@ def _calc_match_score( a list of lists of per amino acid scores (for an entire batch) """ - # Remove trailing tokens from predictions, - batch_all_aa_scores = batch_all_aa_scores[:, :-1] + # Remove trailing tokens from predictions based on decoder reversal + if decoder_reverse: + batch_all_aa_scores = batch_all_aa_scores[:, 1:] + elif not decoder_reverse: + batch_all_aa_scores = batch_all_aa_scores[:, :-1] # Vectorized scoring using efficient indexing. rows = ( diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 14aebf8d..e150ab2d 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -322,8 +322,6 @@ def initialize_model( map_location=device, **loaded_model_params, ) - ## TODO move? - self.model.num_pairs = self.config.predict_batch_size else: self.model = Spec2Pep.load_from_checkpoint( self.model_filename, @@ -351,8 +349,6 @@ def initialize_model( map_location=device, **model_params, ) - ## TODO move? - self.model.num_pairs = self.config.predict_batch_size else: self.model = Spec2Pep.load_from_checkpoint( self.model_filename, From ea1f97df98724fa3bd9b5128792eccff11040bed Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Tue, 18 Jun 2024 18:23:55 -0700 Subject: [PATCH 18/84] cleaned up on_predict_batch_end, TODOs for calc_mz --- casanovo/denovo/model.py | 54 +++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 9dcb3e7e..2f3f9aed 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -12,6 +12,7 @@ import numpy as np import lightning.pytorch as pl from torch.utils.tensorboard import SummaryWriter +from pyteomics import mass from depthcharge.components import ModelMixin, PeptideDecoder, SpectrumEncoder from . 
import evaluate @@ -1043,24 +1044,26 @@ def predict_step(self, batch, *args): return batch_res def smart_batch_gen(self, batch): - batch_size = len(batch[0]) all_psm = [] + batch_size = len(batch[0]) enc = self.encoder(batch[0]) precursors = batch[1] indexes = batch[3] enc = list(zip(*enc)) - for idx, _ in enumerate(batch[0]): + for idx in range(batch_size): spec_peptides = batch[2][idx].split(",") # Check for decoy prefixes and create a bit-vector indicating targets (1) or decoys (0) decoy_prefix = "decoy_" # Decoy prefix - decoy_mask = [ - 0 if p.startswith(decoy_prefix) else 1 for p in spec_peptides - ] - # Remove decoy prefix - spec_peptides = [ - s[len(decoy_prefix) :] if s.startswith(decoy_prefix) else s - for s in spec_peptides - ] + id_decoys = np.array( + [ + (0, p.removeprefix(decoy_prefix)) + if p.startswith(decoy_prefix) + else (1, p) + for p in spec_peptides + ] + ) + decoy_mask = np.array(id_decoys[:, 0], dtype=bool) + spec_peptides = list(id_decoys[:, 1]) spec_precursors = [precursors[idx]] * len(spec_peptides) spec_enc = [enc[idx]] * len(spec_peptides) spec_idx = [indexes[idx]] * len(spec_peptides) @@ -1105,29 +1108,22 @@ def on_predict_batch_end( per_aa_score, precursors, ) in outputs: - for index, t_or_d, peptide, score, per_aa_scores, precursor in zip( - indexes, - t_or_d, + prec_mass = precursors[:, 0] + prec_charge = precursors[:, 1] + prec_mz = precursors[:, 2] + # calc_mz = [mass.fast_mass(pep, charge=int(pc)) for pep, pc in zip(peptides, prec_charge)] + calc_mz = prec_mass # TODO: Replace with actual calc_mz + for row in zip( peptides, score_result, + prec_charge, + prec_mz, + calc_mz, + indexes, per_aa_score, - precursors, + t_or_d, ): - prec_charge = precursor[1] - prec_mz = precursor[2] - calc_mz = precursor[2] - self.out_writer.psms.append( - ( - peptide, - score, - prec_charge, - prec_mz, - calc_mz, - index, - per_aa_scores, - t_or_d, - ), - ) + self.out_writer.psms.append(row) def _calc_match_score( From 8825506da091aa7aaa7dac0da78608b07fc48978 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Wed, 26 Jun 2024 18:12:17 -0700 Subject: [PATCH 19/84] add proper calc_mz calculation with depthcharge --- casanovo/denovo/model.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 2f3f9aed..71f4a6fa 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1111,8 +1111,10 @@ def on_predict_batch_end( prec_mass = precursors[:, 0] prec_charge = precursors[:, 1] prec_mz = precursors[:, 2] - # calc_mz = [mass.fast_mass(pep, charge=int(pc)) for pep, pc in zip(peptides, prec_charge)] - calc_mz = prec_mass # TODO: Replace with actual calc_mz + calc_mz = [ + self.peptide_mass_calculator.mass(peptide, charge) + for peptide, charge in zip(peptides, prec_charge) + ] for row in zip( peptides, score_result, From f25ace84af22ea0eefb0a96a705cfc7957912d13 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Tue, 2 Jul 2024 16:22:02 -0700 Subject: [PATCH 20/84] rough implementation --- casanovo/casanovo.py | 90 +++++++++++++++++- casanovo/data/datasets.py | 14 +-- casanovo/data/db_utils.py | 156 ++++++++++++++++++++++++++++++++ casanovo/data/ms_io.py | 4 +- casanovo/denovo/dataloaders.py | 57 +----------- casanovo/denovo/model.py | 62 ++++++------- casanovo/denovo/model_runner.py | 39 ++++++-- 7 files changed, 307 insertions(+), 115 deletions(-) create mode 100644 casanovo/data/db_utils.py diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 7db5faa8..df3cc79f 100644 --- 
a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -214,8 +214,74 @@ def annotate( nargs=-1, type=click.Path(exists=True, dir_okay=False), ) +@click.argument( + "fasta_path", + required=True, + nargs=1, + type=click.Path(exists=True, dir_okay=False), +) +@click.option( + "--enzyme", + help="Enzyme for in silico digestion, see pyteomics.parser.expasy_rules", + type=str, + default="trypsin", +) +@click.option( + "--digestion", + help="Digestion: full, partial", + type=click.Choice( + ["full", "partial"], + case_sensitive=False, + ), + default="full", +) +@click.option( + "--missed_cleavages", + help="Number of allowed missed cleavages", + type=int, + default=0, +) +@click.option( + "--max_mods", + help="Maximum number of modifications per peptide", + type=int, + default=0, +) +@click.option( + "--min_length", + help="Minimum peptide length", + type=int, + default=6, +) +@click.option( + "--max_length", + help="Maximum peptide length", + type=int, + default=50, +) +@click.option( + "--precursor_tolerance", + help="Precursor tolerance window size (ppm)", + type=int, + default=20, +) +@click.option( + "--isotope_error", + help="Isotope error levels to consider (list of ints, e.g: 1,2)", + type=str, + default="0", +) def db_search( peak_path: Tuple[str], + fasta_path: str, + enzyme: str, + digestion: str, + missed_cleavages: int, + max_mods: int, + min_length: int, + max_length: int, + precursor_tolerance: int, + isotope_error: str, model: Optional[str], config: Optional[str], output: Optional[str], @@ -223,14 +289,30 @@ def db_search( ) -> None: """Perform a search using Casanovo-DB. - PEAK_PATH must be one MGF file that has ANNOTATED spectra, - as output by annotate mode. + PEAK_PATH must be one MGF file. FASTA_PATH must be one FASTA file. """ output = setup_logging(output, verbosity) config, model = setup_model(model, config, output, False) with ModelRunner(config, model) as runner: - logger.info("DB-searching peptides from: %s", peak_path) - runner.db_search(peak_path, output) + logger.info("Performing database search on:") + for peak_file in peak_path: + logger.info(" %s", peak_file) + logger.info("Using the following FASTA file:") + logger.info(" %s", fasta_path) + + runner.db_search( + peak_path, + fasta_path, + enzyme, + digestion, + missed_cleavages, + max_mods, + min_length, + max_length, + precursor_tolerance, + isotope_error, + output, + ) logger.info("DONE!") diff --git a/casanovo/data/datasets.py b/casanovo/data/datasets.py index aff6af85..59f56b68 100644 --- a/casanovo/data/datasets.py +++ b/casanovo/data/datasets.py @@ -134,6 +134,8 @@ def _process_peaks( The precursor m/z. precursor_charge : int The precursor charge. + track_spectrum_id : Optional[bool] + Whether to keep track of the identifier of the MS/MS spectra. Returns ------- @@ -212,8 +214,6 @@ class AnnotatedSpectrumDataset(SpectrumDataset): random_state : Optional[int] The NumPy random state. ``None`` leaves mass spectra in the order they were parsed. - track_spectrum_id : Optional[bool] - Whether to keep track of the identifier of the MS/MS spectra. 
""" def __init__( @@ -225,7 +225,6 @@ def __init__( min_intensity: float = 0.01, remove_precursor_tol: float = 2.0, random_state: Optional[int] = None, - track_spectrum_id: Optional[bool] = False, ): super().__init__( annotated_spectrum_index, @@ -236,7 +235,6 @@ def __init__( remove_precursor_tol=remove_precursor_tol, random_state=random_state, ) - self.track_spectrum_id = track_spectrum_id def __getitem__(self, idx: int) -> Tuple[torch.Tensor, float, int, str]: """ @@ -268,12 +266,4 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, float, int, str]: spectrum = self._process_peaks( mz_array, int_array, precursor_mz, precursor_charge ) - if self.track_spectrum_id: - return ( - spectrum, - precursor_mz, - precursor_charge, - peptide, - self.get_spectrum_id(idx), - ) return spectrum, precursor_mz, precursor_charge, peptide diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py new file mode 100644 index 00000000..c961e35e --- /dev/null +++ b/casanovo/data/db_utils.py @@ -0,0 +1,156 @@ +"""Unique methods used within db-search mode""" + +import os +import depthcharge.masses +from pyteomics import fasta, parser +import bisect + +HYDROGEN = 1.007825035 +OXYGEN = 15.99491463 +H2O = 2 * HYDROGEN + OXYGEN +PROTON = 1.00727646677 +ISOTOPE_SPACING = 1.003355 # - 0.00288 + +var_mods = { + "d": ["N", "Q"], + "ox": ["M"], + "ace-": True, + "carb-": True, + "nh3x-": True, + "carbnh3x-": True, +} +fixed_mods = {"carbm": ["C"]} + + +def convert_from_modx(seq): + """Converts peptide sequence from modX format to Casanovo-acceptable modifications. + + Args: + seq (str): Peptide in modX format + """ + seq = seq.replace("carbmC", "C+57.021") # Fixed modification + seq = seq.replace("oxM", "M+15.995") + seq = seq.replace("dN", "N+0.984") + seq = seq.replace("dQ", "Q+0.984") + seq = seq.replace("ace-", "+42.011") + seq = seq.replace("carbnh3x-", "+43.006-17.027") + seq = seq.replace("carb-", "+43.006") + seq = seq.replace("nh3x-", "-17.027") + return seq + + +def digest_fasta( + fasta_filename, + enzyme, + digestion, + missed_cleavages, + max_mods, + min_length, + max_length, +): + """TODO: Add docstring""" + + # Verify the eistence of the file: + if not os.path.isfile(fasta_filename): + print(f"File {fasta_filename} does not exist.") + raise FileNotFoundError(f"File {fasta_filename} does not exist.") + + fasta_data = fasta.read(fasta_filename) + peptide_list = [] + if digestion in ["full", "partial"]: + semi = True if digestion == "partial" else False + for header, seq in fasta_data: + pep_set = parser.cleave( + seq, + rule=parser.expasy_rules[enzyme], + missed_cleavages=missed_cleavages, + semi=semi, + ) + protein = header.split()[0] + peptide_list.extend([(pep, protein) for pep in pep_set]) + else: + raise ValueError(f"Digestion type {digestion} not recognized.") + + # Generate modified peptides + mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") + mass_calculator.masses.update({"X": 0.0}) # TODO: REMOVE? + mod_peptide_list = [] + for pep, prot in peptide_list: + if len(pep) < min_length or len(pep) > max_length: + continue + peptide_isoforms = parser.isoforms( + pep, + variable_mods=var_mods, + fixed_mods=fixed_mods, + max_mods=max_mods, + ) + peptide_isoforms = list(map(convert_from_modx, peptide_isoforms)) + mod_peptide_list.extend( + (mod_pep, mass_calculator.mass(mod_pep), prot) + for mod_pep in peptide_isoforms + ) + + # Sort the peptides by mass and return. 
+ mod_peptide_list.sort(key=lambda x: x[1]) + return mod_peptide_list + + +def get_candidates( + precursor_mass, charge, peptide_list, precursor_tolerance, isotope_error +): + """TODO: ADD DOCSTRING""" + + candidates = set() + + isotope_error = [int(x) for x in isotope_error.split(",")] + for e in isotope_error: + iso_shift = ISOTOPE_SPACING * e + upper_bound = (_to_raw_mass(precursor_mass, charge) - iso_shift) * ( + 1 + (precursor_tolerance / 1e6) + ) + lower_bound = (_to_raw_mass(precursor_mass, charge) - iso_shift) * ( + 1 - (precursor_tolerance / 1e6) + ) + + start, end = get_mass_indices( + [x[1] for x in peptide_list], lower_bound, upper_bound + ) + + candidates.update(peptide_list[start:end]) + + candidates = list(candidates) + candidates.sort(key=lambda x: x[1]) + return candidates + + +def _to_mz(precursor_mass, charge): + """TODO: ADD DOCSTRING""" + return (precursor_mass + (charge * PROTON)) / charge + + +def _to_raw_mass(mz_mass, charge): + """TODO: ADD DOCSTRING""" + return charge * (mz_mass - PROTON) + + +def get_mass_indices(masses, m_low, m_high): + """Grabs mass indices from a list of mass values that fall within a specified range. + Requires that the mass values are sorted in ascending order. + + Parameters + ---------- + masses : List[int] + List of mass values + m_low : int + Lower bound of mass range (inclusive) + m_high : int + Upper bound of mass range (inclusive) + + Return + ------ + indices : Tuple[int, int] + Indices of mass values that fall within the specified range + """ + start = bisect.bisect_left(masses, m_low) + end = bisect.bisect_right(masses, m_high) + return start, end diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py index c4cfc7cb..d47b9b04 100644 --- a/casanovo/data/ms_io.py +++ b/casanovo/data/ms_io.py @@ -251,7 +251,6 @@ def save_db_variant(self) -> None: "start", "end", "opt_ms_run[1]_aa_scores", - "opt_cv_MS:1002217_decoy_peptide", ] ) for i, psm in enumerate(self.psms): @@ -259,7 +258,7 @@ def save_db_variant(self) -> None: [ "PSM", psm[0], # sequence - f"{psm[5]}:{i}", # spectra_ref + f"{psm[5]}:{i}", # PSM_ID (spectrum # :candidate #) "null", # accession "null", # unique "null", # database @@ -284,6 +283,5 @@ def save_db_variant(self) -> None: ) ) ), # opt_ms_run[1]_aa_scores - bool(psm[7]), # opt_cv_MS:1002217_decoy_peptide ] ) diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index aff860a1..ba02936c 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -127,13 +127,12 @@ def setup(self, stage: str = None, annotated: bool = True) -> None: self.test_dataset = make_dataset(self.test_index) if stage == "db": make_dataset = functools.partial( - AnnotatedSpectrumDataset, + SpectrumDataset, n_peaks=self.n_peaks, min_mz=self.min_mz, max_mz=self.max_mz, min_intensity=self.min_intensity, remove_precursor_tol=self.remove_precursor_tol, - track_spectrum_id=True, ) if self.test_index is not None: self.test_dataset = make_dataset(self.test_index) @@ -143,7 +142,6 @@ def _make_loader( dataset: torch.utils.data.Dataset, batch_size: int, shuffle: bool = False, - db_mode: bool = False, ) -> torch.utils.data.DataLoader: """ Create a PyTorch DataLoader. 
@@ -167,7 +165,7 @@ def _make_loader( return torch.utils.data.DataLoader( dataset, batch_size=batch_size, - collate_fn=prepare_batch if not db_mode else prepare_db_batch, + collate_fn=prepare_batch, pin_memory=True, num_workers=self.n_workers, shuffle=shuffle, @@ -191,12 +189,6 @@ def predict_dataloader(self) -> torch.utils.data.DataLoader: """Get the predict DataLoader.""" return self._make_loader(self.test_dataset, self.eval_batch_size) - def db_dataloader(self) -> torch.utils.data.DataLoader: - """Get the predict DataLoader.""" - return self._make_loader( - self.test_dataset, self.eval_batch_size, db_mode=True - ) - def prepare_batch( batch: List[Tuple[torch.Tensor, float, int, str]] @@ -235,48 +227,3 @@ def prepare_batch( [precursor_masses, precursor_charges, precursor_mzs] ).T.float() return spectra, precursors, np.asarray(spectrum_ids) - - -def prepare_db_batch( - batch: List[Tuple[torch.Tensor, float, int, str, Tuple[str, str]]] -) -> Tuple[torch.Tensor, torch.Tensor, np.ndarray, Tuple[str, str]]: - """ - Collate MS/MS spectra into a batch meant for Casanovo-DB. - - Parameters - ---------- - batch : List[Tuple[torch.Tensor, Tuple[float, int, float], str, Tuple[str, str]]] - A batch of data from an AnnotatedSpectrumDataset, consisting of for each - spectrum (i) a tensor with the m/z and intensity peak values, - (ii) the precursor information [mass, charge, m/z], (iii) the - peptide sequence, the precursor m/z, (iv) spectrum identifiers - (file and scan). - - Returns - ------- - spectra : torch.Tensor of shape (batch_size, n_peaks, 2) - The padded mass spectra tensor with the m/z and intensity peak values - for each spectrum. - precursors : torch.Tensor of shape (batch_size, 3) - A tensor with the precursor neutral mass, precursor charge, and - precursor m/z. - spectrum_peps : np.ndarray - Peptide sequences - spectrum_ids : Tuple[str, str] - Peak file and spectrum identifier - """ - ( - spectra, - precursor_mzs, - precursor_charges, - spectrum_peps, - spectrum_ids, - ) = list(zip(*batch)) - spectra = torch.nn.utils.rnn.pad_sequence(spectra, batch_first=True) - precursor_mzs = torch.tensor(precursor_mzs) - precursor_charges = torch.tensor(precursor_charges) - precursor_masses = (precursor_mzs - 1.007276) * precursor_charges - precursors = torch.vstack( - [precursor_masses, precursor_charges, precursor_mzs] - ).T.float() - return spectra, precursors, np.asarray(spectrum_peps), spectrum_ids diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 71f4a6fa..be7dba9a 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -17,7 +17,7 @@ from . import evaluate from .. import config -from ..data import ms_io +from ..data import ms_io, db_utils logger = logging.getLogger("casanovo") @@ -1009,19 +1009,18 @@ def predict_step(self, batch, *args): ---------- batch : Tuple[torch.Tensor, torch.Tensor, torch.Tensor] A batch of (i) MS/MS spectra, (ii) precursor information, (iii) - spectrum identifiers as torch Tensors, (iv) scan numbers. + spectrum identifiers as torch Tensors Returns ------- - predictions: List[Tuple[int, bool, str, float, np.ndarray, np.ndarray]] + predictions: List[Tuple[int, str, float, np.ndarray, np.ndarray]] Model predictions for the given batch of spectra containing spectrum - scan number, decoy flag, peptide sequence, Casanovo-DB score, + scan number, peptide sequence, Casanovo-DB score, amino acid-level confidence scores, and precursor information. 
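As a simplified illustration of the scoring performed here (the real _calc_match_score is vectorized over the whole batch, masks padding, and handles decoder reversal; the function name, shapes, and token indices below are toy values, not Casanovo's real vocabulary): the predicted probability of every ground-truth residue is gathered and the mean log-probability is reported per candidate peptide.

    import torch

    def toy_match_score(aa_probs, truth_idx):
        # aa_probs: (batch, seq_len, vocab) softmax output; truth_idx: (batch, seq_len)
        per_aa = torch.gather(aa_probs, 2, truth_idx.unsqueeze(-1)).squeeze(-1)
        per_aa = per_aa.clamp_min(1e-10)      # avoid log(0) for zero-probability residues
        return torch.log(per_aa).mean(dim=1)  # one score per candidate PSM

    probs = torch.full((1, 4, 29), 1e-10)
    probs[0, [0, 1, 2, 3], [1, 2, 3, 28]] = torch.tensor([1.0, 1.0, 0.5, 1.0])
    print(toy_match_score(probs, torch.tensor([[1, 2, 3, 28]])))  # ~= log(0.5) / 4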
""" batch_res = [] for ( indexes, - is_decoy, peptides, precursors, encoded_ms, @@ -1034,7 +1033,6 @@ def predict_step(self, batch, *args): batch_res.append( ( indexes, - is_decoy, peptides, score_result.cpu().detach().numpy(), per_aa_score.cpu().detach().numpy(), @@ -1043,27 +1041,25 @@ def predict_step(self, batch, *args): ) return batch_res - def smart_batch_gen(self, batch): + def smart_batch_gen(self, spectrum_batch): + """TODO: ADD DOCSTRING""" all_psm = [] - batch_size = len(batch[0]) - enc = self.encoder(batch[0]) - precursors = batch[1] - indexes = batch[3] + batch_size = len(spectrum_batch[0]) + enc = self.encoder(spectrum_batch[0]) enc = list(zip(*enc)) + precursors = spectrum_batch[1] + indexes = spectrum_batch[2] for idx in range(batch_size): - spec_peptides = batch[2][idx].split(",") - # Check for decoy prefixes and create a bit-vector indicating targets (1) or decoys (0) - decoy_prefix = "decoy_" # Decoy prefix - id_decoys = np.array( - [ - (0, p.removeprefix(decoy_prefix)) - if p.startswith(decoy_prefix) - else (1, p) - for p in spec_peptides - ] + spec_peptides = db_utils.get_candidates( + precursors[idx][2], + precursors[idx][1], + self.digest, + self.precursor_tolerance, + self.isotope_error, ) - decoy_mask = np.array(id_decoys[:, 0], dtype=bool) - spec_peptides = list(id_decoys[:, 1]) + spec_peptides = [ + a[0] for a in spec_peptides + ] # TODO: USE MASS AND PROTEIN INFORMATION spec_precursors = [precursors[idx]] * len(spec_peptides) spec_enc = [enc[idx]] * len(spec_peptides) spec_idx = [indexes[idx]] * len(spec_peptides) @@ -1074,24 +1070,22 @@ def smart_batch_gen(self, batch): spec_precursors, spec_peptides, spec_idx, - decoy_mask, ) ) ) # Continually grab num_pairs items from all_psm until list is exhausted while len(all_psm) > 0: - batch = all_psm[:batch_size] + psm_batch = all_psm[:batch_size] all_psm = all_psm[batch_size:] - batch = list(zip(*batch)) + psm_batch = list(zip(*psm_batch)) encoded_ms = ( - torch.stack([a[0] for a in batch[0]]), - torch.stack([a[1] for a in batch[0]]), + torch.stack([a[0] for a in psm_batch[0]]), + torch.stack([a[1] for a in psm_batch[0]]), ) - prec_data = torch.stack(batch[1]) - pep_str = list(batch[2]) - indexes = [a[1] for a in batch[3]] - is_decoy = batch[4] - yield (indexes, is_decoy, pep_str, prec_data, encoded_ms) + prec_data = torch.stack(psm_batch[1]) + pep_str = list(psm_batch[2]) + indexes = [a[1] for a in psm_batch[3]] + yield (indexes, pep_str, prec_data, encoded_ms) def on_predict_batch_end( self, @@ -1102,7 +1096,6 @@ def on_predict_batch_end( return for ( indexes, - t_or_d, peptides, score_result, per_aa_score, @@ -1123,7 +1116,6 @@ def on_predict_batch_end( calc_mz, indexes, per_aa_score, - t_or_d, ): self.out_writer.psms.append(row) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index e150ab2d..73dfdff2 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -18,7 +18,7 @@ from lightning.pytorch.callbacks import ModelCheckpoint from ..config import Config -from ..data import ms_io +from ..data import ms_io, db_utils from ..denovo.dataloaders import DeNovoDataModule from ..denovo.model import Spec2Pep, DbSpec2Pep @@ -79,13 +79,29 @@ def __exit__(self, exc_type, exc_value, traceback): if self.writer is not None: self.writer.save() - def db_search(self, peak_path: Iterable[str], output: str) -> None: + def db_search( + self, + peak_path: Iterable[str], + fasta_path: str, + enzyme: str, + digestion: str, + missed_cleavages: int, + max_mods: int, + min_length: 
int, + max_length: int, + precursor_tolerance: float, + isotope_error: float, + output: str, + ) -> None: """Perform database search with Casanovo. Parameters ---------- - peak_path : iterable of str - The path to the annotated .mgf data files for database search. + peak_path : Iterable[str] + The path to the .mgf data file for database search. + fasta_path : str + The path to the FASTA file for database search. + # TODO: ADD ALL DOCUMENTATION output : str Where should the output be saved? @@ -105,12 +121,23 @@ def db_search(self, peak_path: Iterable[str], output: str) -> None: self.initialize_trainer(train=True) self.initialize_model(train=False, db_search=True) self.model.out_writer = self.writer + self.model.digest = db_utils.digest_fasta( + fasta_path, + enzyme, + digestion, + missed_cleavages, + max_mods, + min_length, + max_length, + ) + self.model.precursor_tolerance = precursor_tolerance + self.model.isotope_error = isotope_error - test_index = self._get_index(peak_path, True, "db search") + test_index = self._get_index(peak_path, False, "db search") self.writer.set_ms_run(test_index.ms_files) self.initialize_data_module(test_index=test_index) self.loaders.setup(stage="db") - self.trainer.predict(self.model, self.loaders.db_dataloader()) + self.trainer.predict(self.model, self.loaders.predict_dataloader()) def train( self, From f7dfbc8356d8993c219dbfaeccf59753f555fa07 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Tue, 2 Jul 2024 19:54:56 -0700 Subject: [PATCH 21/84] tested implementation of db search --- casanovo/casanovo.py | 107 ++++---- casanovo/data/annotate_db.py | 138 ---------- casanovo/data/db_utils.py | 109 ++++++-- casanovo/data/ms_io.py | 2 +- casanovo/denovo/model.py | 32 ++- casanovo/denovo/model_runner.py | 19 +- tests/conftest.py | 51 +++- tests/test_integration.py | 99 +------- tests/unit_tests/test_unit.py | 430 +++++++++++++++++++++++++++++++- 9 files changed, 666 insertions(+), 321 deletions(-) delete mode 100644 casanovo/data/annotate_db.py diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index df3cc79f..8ae9a81b 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -42,7 +42,6 @@ from . import utils from .denovo import ModelRunner from .config import Config -from .data.annotate_db import annotate_mgf logger = logging.getLogger("casanovo") click.rich_click.USE_MARKDOWN = True @@ -146,67 +145,6 @@ def sequence( logger.info("DONE!") -@main.command() -@click.argument( - "peak_path", - required=True, - nargs=1, - type=click.Path(exists=True, dir_okay=False), -) -@click.argument( - "tide_path", - required=True, - nargs=1, - type=click.Path(exists=True, dir_okay=True), -) -@click.option( - "-o", - "--output", - help="The output annotated MGF file.", - type=click.Path(dir_okay=False), -) -@click.option( - "-v", - "--verbosity", - help=""" - Set the verbosity of console logging messages. Log files are - always set to 'debug'. - """, - type=click.Choice( - ["debug", "info", "warning", "error"], - case_sensitive=False, - ), - default="info", -) -def annotate( - peak_path: str, - tide_path: str, - output: Optional[str], - verbosity: str, -) -> None: - """Annotate a given .mgf with candidates as selected by a Tide search for Casanovo-DB. - - PEAK_PATH must be one MGF file from which to annotate spectra. - - TIDE_PATH must be one directory containing the Tide search results of the .mgf. 
- This directory must contain tide-search.decoy.txt and tide-search.target.txt - """ - if output is None: - output = setup_logging(output, verbosity) - logger.info( - "Output file not specified. \ - Annotated MGF will be saved in the same directory \ - as the input MGF." - ) - output = peak_path.replace(".mgf", "_annotated.mgf") - else: - output = setup_logging(output, verbosity) - - annotate_mgf(peak_path, tide_path, output) - - logger.info("DONE!") - - @main.command(cls=_SharedParams) @click.argument( "peak_path", @@ -222,8 +160,47 @@ def annotate( ) @click.option( "--enzyme", - help="Enzyme for in silico digestion, see pyteomics.parser.expasy_rules", - type=str, + help="Enzyme for in silico digestion, \ + See pyteomics.parser.expasy_rules for valid enzymes", + type=click.Choice( + [ + "arg-c", + "asp-n", + "bnps-skatole", + "caspase 1", + "caspase 2", + "caspase 3", + "caspase 4", + "caspase 5", + "caspase 6", + "caspase 7", + "caspase 8", + "caspase 9", + "caspase 10", + "chymotrypsin high specificity", + "chymotrypsin low specificity", + "clostripain", + "cnbr", + "enterokinase", + "factor xa", + "formic acid", + "glutamyl endopeptidase", + "granzyme b", + "hydroxylamine", + "iodosobenzoic acid", + "lysc", + "ntcb", + "pepsin ph1.3", + "pepsin ph2.0", + "proline endopeptidase", + "proteinase k", + "staphylococcal peptidase i", + "thermolysin", + "thrombin", + "trypsin", + "trypsin_exception", + ] + ), default="trypsin", ) @click.option( @@ -287,7 +264,7 @@ def db_search( output: Optional[str], verbosity: str, ) -> None: - """Perform a search using Casanovo-DB. + """Perform a database search on MS/MS data using Casanovo-DB. PEAK_PATH must be one MGF file. FASTA_PATH must be one FASTA file. """ diff --git a/casanovo/data/annotate_db.py b/casanovo/data/annotate_db.py deleted file mode 100644 index dd2e6c64..00000000 --- a/casanovo/data/annotate_db.py +++ /dev/null @@ -1,138 +0,0 @@ -"""Methods used to annotate an .mgf so that it can be used by Casanovo-DB""" - -from pathlib import Path -from typing import Optional, Tuple -import os -import re -import logging - -import pandas as pd -import pyteomics.mgf as mgf - - -def _normalize_mods(seq: str) -> str: - """ - Turns tide-style modifications into the format used by Casanovo-DB. - - Parameters - ---------- - seq : str - The peptide sequence with tide-style modifications. - - Returns - ------- - str - The peptide sequence with Casanovo-DB-style modifications. - """ - logger = logging.getLogger("casanovo") - seq = seq.replace("C", "C+57.021") - seq = re.sub(r"M\[15\.[0-9]*\]", r"M+15.995", seq) - seq = re.sub(r"N\[0\.9[0-9]*\]", r"N+0.984", seq) - seq = re.sub(r"Q\[0\.9[0-9]*\]", r"Q+0.984", seq) - seq = re.sub(r"(.*)\[42\.[0-9]*\]", r"+42.011\1", seq) - seq = re.sub(r"(.*)\[43\.[0-9]*\]", r"+43.006\1", seq) - seq = re.sub(r"(.*)\[\-17\.[0-9]*\]", r"-17.027\1", seq) - seq = re.sub(r"(.*)\[25\.[0-9]*\]", r"+43.006-17.027\1", seq) - return seq - - -def annotate_mgf(peak_path: str, tide_path: str, output: Optional[str]): - """ - Accepts a directory containing the results of a successful tide search, - and an .mgf file containing MS/MS spectra. - The .mgf file is then annotated in the SEQ field with - all of the candidate peptides for each spectrum, as well as their target/decoy status. - This annotated .mgf can be given directly to Casanovo-DB to perfrom a database search. - - Parameters - ---------- - tide_dir_path : str - Path to the directory containing the results of a successful tide search. 
- mgf_file : str - Path to the .mgf file containing MS/MS spectra. - output_file : str - Path to where the annotated .mgf will be written. - - """ - logger = logging.getLogger("casanovo") - # Get paths to tide search text files - tdf_path = os.path.join(tide_path, "tide-search.target.txt") - ddf_path = os.path.join(tide_path, "tide-search.decoy.txt") - try: - target_df = pd.read_csv( - tdf_path, sep="\t", usecols=["scan", "sequence", "target/decoy"] - ) - decoy_df = pd.read_csv( - ddf_path, sep="\t", usecols=["scan", "sequence", "target/decoy"] - ) - except FileNotFoundError as e: - logger.error( - "Could not find tide search results in the specified directory. " - "Please ensure that the directory contains the following files: " - "tide-search.target.txt and tide-search.decoy.txt" - ) - raise e - - logger.info("Successfully read tide search results from %s.", tide_path) - - df = pd.concat([target_df, decoy_df]) - scan_groups = df.groupby("scan")[["sequence", "target/decoy"]] - - scan_map = {} - - for scan, item in scan_groups: - td_group = item.groupby("target/decoy")["sequence"].apply(list) - if "target" in td_group.index: - target_candidate_list = list( - map( - _normalize_mods, - td_group["target"], - ) - ) - else: - target_candidate_list = [] - logger.warn(f"No target peptides found for scan {scan}.") - if "decoy" in td_group.index: - decoy_candidate_list = list( - map( - _normalize_mods, - td_group["decoy"], - ) - ) - decoy_candidate_list = list( - map(lambda x: "decoy_" + str(x), decoy_candidate_list) - ) - else: - decoy_candidate_list = [] - logger.warn(f"No decoy peptides found for scan {scan}.") - - pep_list = target_candidate_list + decoy_candidate_list - if len(pep_list) == 0: - logger.warn(f"No peptides found for scan {scan}.") - else: - scan_map[scan] = target_candidate_list + decoy_candidate_list - - all_spec = [] - for idx, spec_dict in enumerate(mgf.read(peak_path)): - try: - scan = int(spec_dict["params"]["scans"]) - except KeyError as e: - logger.error( - "Could not find the scan number in the .mgf file." - "Please ensure that the .mgf file contains the scan number in the 'SCANS' field." - ) - raise e - try: - spec_dict["params"]["seq"] = ",".join(list(scan_map[scan])) - all_spec.append(spec_dict) - except KeyError as e: - # No need to do anything if the scan is not found in the scan map - pass - try: - output = str(output) - mgf.write(all_spec, output, file_mode="w") - logger.info("Annotated .mgf file written to %s.", output) - except Exception as e: - logger.error( - "Write to %s failed. Check if the file path is correct.", output - ) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index c961e35e..341a6162 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -5,11 +5,14 @@ from pyteomics import fasta, parser import bisect +from typing import List, Tuple + +# CONSTANTS HYDROGEN = 1.007825035 OXYGEN = 15.99491463 H2O = 2 * HYDROGEN + OXYGEN PROTON = 1.00727646677 -ISOTOPE_SPACING = 1.003355 # - 0.00288 +ISOTOPE_SPACING = 1.003355 var_mods = { "d": ["N", "Q"], @@ -22,7 +25,7 @@ fixed_mods = {"carbm": ["C"]} -def convert_from_modx(seq): +def convert_from_modx(seq: str): """Converts peptide sequence from modX format to Casanovo-acceptable modifications. 
Args: @@ -40,15 +43,41 @@ def convert_from_modx(seq): def digest_fasta( - fasta_filename, - enzyme, - digestion, - missed_cleavages, - max_mods, - min_length, - max_length, + fasta_filename: str, + enzyme: str, + digestion: str, + missed_cleavages: int, + max_mods: int, + min_length: int, + max_length: int, ): - """TODO: Add docstring""" + """ + Digests a FASTA file and returns the peptides, their masses, and associated protein. + + Parameters + ---------- + fasta_filename : str + Path to the FASTA file. + enzyme : str + The enzyme to use for digestion. + See pyteomics.parser.expasy_rules for valid enzymes. + digestion : str + The type of digestion to perform. Either 'full' or 'partial'. + missed_cleavages : int + The number of missed cleavages to allow. + max_mods : int + The maximum number of modifications to allow per peptide. + min_length : int + The minimum length of peptides to consider. + max_length : int + The maximum length of peptides to consider. + + Returns + ------- + mod_peptide_list : List[Tuple[str, float, str]] + A list of tuples containing the peptide sequence, mass, + and associated protein. Sorted by neutral mass in ascending order. + """ # Verify the eistence of the file: if not os.path.isfile(fasta_filename): @@ -96,19 +125,39 @@ def digest_fasta( def get_candidates( - precursor_mass, charge, peptide_list, precursor_tolerance, isotope_error + precursor_mz: float, + charge: int, + peptide_list: List[Tuple[str, float, str]], + precursor_tolerance: int, + isotope_error: str, ): - """TODO: ADD DOCSTRING""" + """ + Returns a list of candidate peptides that fall within the specified mass range. + + Parameters + ---------- + precursor_mz : float + The precursor mass-to-charge ratio. + charge : int + The precursor charge. + peptide_list : List[Tuple[str, float, str]] + A list of tuples containing the peptide sequence, mass, and associated protein. + Must be sorted by mass in ascending order. Uses neutral masses. + precursor_tolerance : float + The precursor mass tolerance in parts-per-million. + isotope_error : str + The isotope error levels to consider. + """ candidates = set() isotope_error = [int(x) for x in isotope_error.split(",")] for e in isotope_error: iso_shift = ISOTOPE_SPACING * e - upper_bound = (_to_raw_mass(precursor_mass, charge) - iso_shift) * ( + upper_bound = (_to_raw_mass(precursor_mz, charge) - iso_shift) * ( 1 + (precursor_tolerance / 1e6) ) - lower_bound = (_to_raw_mass(precursor_mass, charge) - iso_shift) * ( + lower_bound = (_to_raw_mass(precursor_mz, charge) - iso_shift) * ( 1 - (precursor_tolerance / 1e6) ) @@ -124,12 +173,40 @@ def get_candidates( def _to_mz(precursor_mass, charge): - """TODO: ADD DOCSTRING""" + """ + Convert precursor neutral mass to m/z value. + + Parameters + ---------- + precursor_mass : float + The precursor neutral mass. + charge : int + The precursor charge. + + Returns + ------- + mz : float + The calculated precursor mass-to-charge ratio. + """ return (precursor_mass + (charge * PROTON)) / charge def _to_raw_mass(mz_mass, charge): - """TODO: ADD DOCSTRING""" + """ + Convert precursor m/z value to neutral mass. + + Parameters + ---------- + mz_mass : float + The precursor mass-to-charge ratio. + charge : int + The precursor charge. + + Returns + ------- + mass : float + The calculated precursor neutral mass. 
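For example, a minimal standalone sketch of the digestion step, using only pyteomics and mirroring the cleave/isoforms calls made in this function; the protein string is illustrative and the commented results are what those calls are expected to produce:

    from pyteomics import parser

    protein = "MEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGERVTLSCR"
    peptides = parser.cleave(
        protein,
        rule=parser.expasy_rules["trypsin"],  # ExPASy rule: cut after K/R, generally not before P
        missed_cleavages=0,
    )
    # {'MEAPAQLLFLLLLWLPDTTR', 'EIVMTQSPPTLSLSPGER', 'VTLSCR'}

    # Fixed/variable modifications expand each peptide into modX isoforms,
    # which convert_from_modx() then rewrites into Casanovo notation:
    isoforms = list(parser.isoforms("VTLSCR", fixed_mods={"carbm": ["C"]}))
    # e.g. ['VTLScarbmCR'] -> 'VTLSC+57.021R' after convert_from_modx()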
+ """ return charge * (mz_mass - PROTON) diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py index d47b9b04..a701b627 100644 --- a/casanovo/data/ms_io.py +++ b/casanovo/data/ms_io.py @@ -221,7 +221,7 @@ def save_db_variant(self) -> None: Export the Casanovo-DB search results to the mzTab file. Outputs PSMs in the order they were scored - (i.e. the order in the annotated .mgf file). + (i.e. the order in the .mgf file). """ with open(self.filename, "w", newline="") as f: writer = csv.writer(f, delimiter="\t", lineterminator=os.linesep) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index be7dba9a..4d9bd41b 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1009,7 +1009,7 @@ def predict_step(self, batch, *args): ---------- batch : Tuple[torch.Tensor, torch.Tensor, torch.Tensor] A batch of (i) MS/MS spectra, (ii) precursor information, (iii) - spectrum identifiers as torch Tensors + spectrum identifiers as torch Tensors. Returns ------- @@ -1042,7 +1042,21 @@ def predict_step(self, batch, *args): return batch_res def smart_batch_gen(self, spectrum_batch): - """TODO: ADD DOCSTRING""" + """ + Transforms a batch of spectra into multiple equally-sized batches of PSMs. + + Parameters + ---------- + spectrum batch : Tuple[torch.Tensor, torch.Tensor, torch.Tensor] + A batch of (i) MS/MS spectra, (ii) precursor information, (iii) + spectrum identifiers as torch Tensors. + + Yields + ------- + psm_batch: Tuple[List[int], List[str], torch.Tensor, Tuple[torch.Tensor, torch.Tensor]] + A batch of PSMs containing the spectrum index, peptide sequence, + precursor information, and encoded MS/MS spectra. + """ all_psm = [] batch_size = len(spectrum_batch[0]) enc = self.encoder(spectrum_batch[0]) @@ -1050,16 +1064,22 @@ def smart_batch_gen(self, spectrum_batch): precursors = spectrum_batch[1] indexes = spectrum_batch[2] for idx in range(batch_size): - spec_peptides = db_utils.get_candidates( + digest_data = db_utils.get_candidates( precursors[idx][2], precursors[idx][1], self.digest, self.precursor_tolerance, self.isotope_error, ) - spec_peptides = [ - a[0] for a in spec_peptides - ] # TODO: USE MASS AND PROTEIN INFORMATION + logger.debug("%s", digest_data) + try: + spec_peptides, pep_masses, pep_protein = list( + zip(*digest_data) + ) + except ValueError: + logger.info( + "No peptides found for precursor %s", precursors[idx] + ) spec_precursors = [precursors[idx]] * len(spec_peptides) spec_enc = [enc[idx]] * len(spec_peptides) spec_idx = [indexes[idx]] * len(spec_peptides) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 73dfdff2..284acbe8 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -90,7 +90,7 @@ def db_search( min_length: int, max_length: int, precursor_tolerance: float, - isotope_error: float, + isotope_error: str, output: str, ) -> None: """Perform database search with Casanovo. @@ -101,7 +101,22 @@ def db_search( The path to the .mgf data file for database search. fasta_path : str The path to the FASTA file for database search. - # TODO: ADD ALL DOCUMENTATION + enzyme : str + The enzyme used for digestion. + digestion : str + The digestion type, full or partial. + missed_cleavages : int + The number of missed cleavages allowed. + max_mods : int + The maximum number of modifications allowed per peptide. + min_length : int + The minimum peptide length. + max_length : int + The maximum peptide length. + precursor_tolerance : float + The precursor mass tolerance in ppm. 
+ isotope_error : str + Isotope error levels to consider, in comma-delineated string form. output : str Where should the output be saved? diff --git a/tests/conftest.py b/tests/conftest.py index eed4f39a..cac1a873 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,7 +5,7 @@ import psims import pytest import yaml -from pyteomics.mass import calculate_mass +from pyteomics.mass import calculate_mass, fast_mass, std_aa_mass @pytest.fixture @@ -263,6 +263,36 @@ def tiny_config(tmp_path): return cfg_file +@pytest.fixture +def tiny_fasta_file(tmp_path, fasta_raw_data): + fasta_file = tmp_path / "tiny_fasta.fasta" + with fasta_file.open("w+") as fasta_ref: + fasta_ref.write(fasta_raw_data) + + return fasta_file + + +@pytest.fixture +def fasta_raw_data(): + return ">foo\nMEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGERVTLSCRASQSVSSSYLTWYQQKPGQAPRLLIYGASTRATSIPARFSGSGSGTDFTLTISSLQPEDFAVYYCQQDYNLP" + + +@pytest.fixture +def mgf_db_search(tmp_path): + """An MGF file with 2 unannotated spectra and scan numbers.""" + peptides = [ + "ATSIPAR", + "VTLSCR", + "LLIYGASTR", + "EIVMTQSPPTLSLSPGER", + "MEAPAQLLFLLLLWLPDTTR", + "ASQSVSSSYLTWYQQKPGQAPR", + "FSGSGSGTDFTLTISSLQPEDFAVYYCQQDYNLP", + ] + mgf_file = tmp_path / "db_search.mgf" + return _create_unannotated_mgf(peptides, mgf_file, c_mod=True) + + @pytest.fixture def mgf_small_unannotated(tmp_path): """An MGF file with 2 unannotated spectra and scan numbers.""" @@ -271,7 +301,7 @@ def mgf_small_unannotated(tmp_path): return _create_unannotated_mgf(peptides, mgf_file) -def _create_unannotated_mgf(peptides, mgf_file, random_state=999): +def _create_unannotated_mgf(peptides, mgf_file, random_state=999, c_mod=False): """ Create a fake MGF file from one or more peptides. This file will have no SEQ= parameter, but will have a SCANS= parameter. @@ -284,6 +314,9 @@ def _create_unannotated_mgf(peptides, mgf_file, random_state=999): The MGF file to create. random_state : int or numpy.random.Generator, optional The random seed. The charge states are chosen to be 2 or 3 randomly. + c_mod : bool, optional + Whether to use the constant carbamidomethylation + of C in mass calculations. Returns ------- @@ -291,7 +324,7 @@ def _create_unannotated_mgf(peptides, mgf_file, random_state=999): """ rng = np.random.default_rng(random_state) entries = [ - _create_unannotated_mgf_entry(p, idx, rng.choice([2, 3])) + _create_unannotated_mgf_entry(p, idx, rng.choice([2, 3]), c_mod=c_mod) for idx, p in enumerate(peptides) ] with mgf_file.open("w+") as mgf_ref: @@ -300,7 +333,7 @@ def _create_unannotated_mgf(peptides, mgf_file, random_state=999): return mgf_file -def _create_unannotated_mgf_entry(peptide, scan_num, charge): +def _create_unannotated_mgf_entry(peptide, scan_num, charge, c_mod=False): """ Create a MassIVE-KB style MGF entry for a single PSM. Each entry will have no SEQ= parameter, but will have a SCANS= parameter. @@ -313,13 +346,21 @@ def _create_unannotated_mgf_entry(peptide, scan_num, charge): The scan number. charge : int, optional The peptide charge state. + c_mod : bool, optional + Whether to use the constant carbamidomethylation + of C in mass calculations. Returns ------- str The PSM entry in an MGF file format. 
""" - precursor_mz = calculate_mass(peptide, charge=int(charge)) + if not c_mod: + precursor_mz = calculate_mass(peptide, charge=int(charge)) + else: + aa_mass = std_aa_mass + aa_mass.update({"C": 160.030649}) # Carbamidomethylated C mass + precursor_mz = fast_mass(peptide, charge=int(charge), aa_mass=aa_mass) mzs, intensities = _peptide_to_peaks(peptide, charge) frags = "\n".join([f"{m} {i}" for m, i in zip(mzs, intensities)]) diff --git a/tests/test_integration.py b/tests/test_integration.py index 60e3977b..4bd55174 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -7,50 +7,8 @@ from casanovo import casanovo -def test_annotate(mgf_small_unannotated, tide_dir_small, tmp_path): - - # Run a command: - run = functools.partial( - CliRunner().invoke, casanovo.main, catch_exceptions=False - ) - - annotate_args = [ - "annotate", - str(mgf_small_unannotated), - str(tide_dir_small), - "--output", - str(tmp_path / "annotated_mgf.mgf"), - ] - - result = run(annotate_args) - - assert result.exit_code == 0 - assert (tmp_path / "annotated_mgf.mgf").exists() - - # Read in the annotated file - with open(tmp_path / "annotated_mgf.mgf") as f: - annotated_lines = f.readlines() - - # Get each SEQ= line - seq_lines = [line for line in annotated_lines if line.startswith("SEQ=")] - assert len(seq_lines) == 3 - assert ( - seq_lines[0].strip() - == "SEQ=LESLIEK,PEPTIDEK,decoy_KEILSEL,decoy_KEDITEPP" - ) - assert ( - seq_lines[1].strip() - == "SEQ=LESLIEK,PEPTIDEK,decoy_KEILSEL,decoy_KEDITEPP" - ) - assert ( - seq_lines[2].strip() == "SEQ=+42.011LEM+15.995SLIM+15.995EK," - "+43.006PEN+0.984PTIQ+0.984DEK,decoy_-17.027KM+15.995EILSEL," - "decoy_+43.006-17.027KEDITEPP,decoy_KEDIQ+0.984TEPPQ+0.984" - ) - - def test_db_search( - mgf_small_unannotated, tide_dir_small, tiny_config, tmp_path, monkeypatch + mgf_db_search, tiny_fasta_file, tiny_config, tmp_path, monkeypatch ): # Run a command: monkeypatch.setattr(casanovo, "__version__", "4.1.0") @@ -58,30 +16,18 @@ def test_db_search( CliRunner().invoke, casanovo.main, catch_exceptions=False ) - annotate_args = [ - "annotate", - str(mgf_small_unannotated), - str(tide_dir_small), - "--output", - str(tmp_path / "annotated_mgf.mgf"), - ] - - result = run(annotate_args) - - assert result.exit_code == 0 - assert (tmp_path / "annotated_mgf.mgf").exists() - - # Follow up annotate run with db search - output_path = tmp_path / "db_search.mztab" search_args = [ "db-search", - str(tmp_path / "annotated_mgf.mgf"), "--config", tiny_config, "--output", str(output_path), + "--precursor_tolerance", + str(100), + str(mgf_db_search), + str(tiny_fasta_file), ] result = run(search_args) @@ -94,34 +40,13 @@ def test_db_search( psms = mztab.spectrum_match_table assert list(psms.sequence) == [ - "LESLIEK", - "PEPTIDEK", - "KEILSEL", - "KEDITEPP", - "LESLIEK", - "PEPTIDEK", - "KEILSEL", - "KEDITEPP", - "+42.011LEM+15.995SLIM+15.995EK", - "+43.006PEN+0.984PTIQ+0.984DEK", - "-17.027KM+15.995EILSEL", - "+43.006-17.027KEDITEPP", - "KEDIQ+0.984TEPPQ+0.984", - ] - assert list(psms["opt_cv_MS:1002217_decoy_peptide"]) == [ - "True", - "True", - "False", - "False", - "True", - "True", - "False", - "False", - "True", - "True", - "False", - "False", - "False", + "ATSIPAR", + "VTLSC+57.021R", + "LLIYGASTR", + "EIVMTQSPPTLSLSPGER", + "MEAPAQLLFLLLLWLPDTTR", + "ASQSVSSSYLTWYQQKPGQAPR", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", ] diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index ec9085c0..e3707917 100644 --- a/tests/unit_tests/test_unit.py +++ 
b/tests/unit_tests/test_unit.py @@ -10,10 +10,11 @@ import numpy as np import pytest import torch +import re from casanovo import casanovo from casanovo import utils -from casanovo.data import ms_io +from casanovo.data import ms_io, db_utils from casanovo.data.datasets import SpectrumDataset, AnnotatedSpectrumDataset from casanovo.denovo.evaluate import aa_match_batch, aa_match_metrics from casanovo.denovo.model import Spec2Pep, _aa_pep_score, _calc_match_score @@ -219,6 +220,433 @@ def test_calc_match_score(): assert np.sum(masked_per_aa_scores.numpy()[3]) == 3 +def test_digest_fasta_cleave(fasta_raw_data): + + with open("temp_fasta", "w") as file: + file.write(fasta_raw_data) + + # No missed cleavages + expected_normal = [ + "ATSIPAR", + "VTLSC+57.021R", + "LLIYGASTR", + "EIVMTQSPPTLSLSPGER", + "MEAPAQLLFLLLLWLPDTTR", + "ASQSVSSSYLTWYQQKPGQAPR", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + ] + + # 1 missed cleavage + expected_1missedcleavage = [ + "ATSIPAR", + "VTLSC+57.021R", + "LLIYGASTR", + "LLIYGASTRATSIPAR", + "EIVMTQSPPTLSLSPGER", + "MEAPAQLLFLLLLWLPDTTR", + "ASQSVSSSYLTWYQQKPGQAPR", + "EIVMTQSPPTLSLSPGERVTLSC+57.021R", + "VTLSC+57.021RASQSVSSSYLTWYQQKPGQAPR", + "ASQSVSSSYLTWYQQKPGQAPRLLIYGASTR", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + "MEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGER", + "ATSIPARFSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + ] + + # 3 missed cleavages + expected_3missedcleavage = [ + "ATSIPAR", + "VTLSC+57.021R", + "LLIYGASTR", + "LLIYGASTRATSIPAR", + "EIVMTQSPPTLSLSPGER", + "MEAPAQLLFLLLLWLPDTTR", + "ASQSVSSSYLTWYQQKPGQAPR", + "EIVMTQSPPTLSLSPGERVTLSC+57.021R", + "VTLSC+57.021RASQSVSSSYLTWYQQKPGQAPR", + "ASQSVSSSYLTWYQQKPGQAPRLLIYGASTR", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + "ASQSVSSSYLTWYQQKPGQAPRLLIYGASTRATSIPAR", + "VTLSC+57.021RASQSVSSSYLTWYQQKPGQAPRLLIYGASTR", + "MEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGER", + "ATSIPARFSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + "VTLSC+57.021RASQSVSSSYLTWYQQKPGQAPRLLIYGASTRATSIPAR", + "MEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGERVTLSC+57.021R", + "EIVMTQSPPTLSLSPGERVTLSC+57.021RASQSVSSSYLTWYQQKPGQAPR", + "LLIYGASTRATSIPARFSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + ] + + peptide_list = db_utils.digest_fasta( + fasta_filename="temp_fasta", + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + max_mods=0, + min_length=6, + max_length=50, + ) + peptide_list = [x[0] for x in peptide_list] + assert peptide_list == expected_normal + + peptide_list = db_utils.digest_fasta( + fasta_filename="temp_fasta", + enzyme="trypsin", + digestion="full", + missed_cleavages=1, + max_mods=0, + min_length=6, + max_length=50, + ) + peptide_list = [x[0] for x in peptide_list] + assert peptide_list == expected_1missedcleavage + + peptide_list = db_utils.digest_fasta( + fasta_filename="temp_fasta", + enzyme="trypsin", + digestion="full", + missed_cleavages=3, + max_mods=0, + min_length=6, + max_length=50, + ) + peptide_list = [x[0] for x in peptide_list] + assert peptide_list == expected_3missedcleavage + + +def test_digest_fasta_mods(fasta_raw_data): + + with open("temp_fasta", "w") as file: + file.write(fasta_raw_data) + + # 1 modification allowed + # fixed: C+57.02146 + # variable: 1M+15.994915,1N+0.984016,1Q+0.984016 + # nterm: 1X+42.010565,1X+43.005814,1X-17.026549,1X+25.980265 + expected_1mod = [ + "-17.027ATSIPAR", + "ATSIPAR", + "-17.027VTLSC+57.021R", + "VTLSC+57.021R", + "+43.006-17.027ATSIPAR", + "+42.011ATSIPAR", + "+43.006ATSIPAR", + "+43.006-17.027VTLSC+57.021R", + "+42.011VTLSC+57.021R", + "+43.006VTLSC+57.021R", + 
"-17.027LLIYGASTR", + "LLIYGASTR", + "+43.006-17.027LLIYGASTR", + "+42.011LLIYGASTR", + "+43.006LLIYGASTR", + "-17.027EIVMTQSPPTLSLSPGER", + "EIVMTQSPPTLSLSPGER", + "EIVMTQ+0.984SPPTLSLSPGER", + "EIVM+15.995TQSPPTLSLSPGER", + "+43.006-17.027EIVMTQSPPTLSLSPGER", + "+42.011EIVMTQSPPTLSLSPGER", + "+43.006EIVMTQSPPTLSLSPGER", + "-17.027MEAPAQLLFLLLLWLPDTTR", + "MEAPAQLLFLLLLWLPDTTR", + "MEAPAQ+0.984LLFLLLLWLPDTTR", + "M+15.995EAPAQLLFLLLLWLPDTTR", + "+43.006-17.027MEAPAQLLFLLLLWLPDTTR", + "+42.011MEAPAQLLFLLLLWLPDTTR", + "+43.006MEAPAQLLFLLLLWLPDTTR", + "-17.027ASQSVSSSYLTWYQQKPGQAPR", + "ASQSVSSSYLTWYQQKPGQAPR", + "ASQ+0.984SVSSSYLTWYQQKPGQAPR", + "ASQSVSSSYLTWYQ+0.984QKPGQAPR", + "ASQSVSSSYLTWYQQ+0.984KPGQAPR", + "ASQSVSSSYLTWYQQKPGQ+0.984APR", + "+43.006-17.027ASQSVSSSYLTWYQQKPGQAPR", + "+42.011ASQSVSSSYLTWYQQKPGQAPR", + "+43.006ASQSVSSSYLTWYQQKPGQAPR", + "-17.027FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + "FSGSGSGTDFTLTISSLQ+0.984PEDFAVYYC+57.021QQDYNLP", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021Q+0.984QDYNLP", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQ+0.984DYNLP", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYN+0.984LP", + "+43.006-17.027FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + "+42.011FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + "+43.006FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + ] + + peptide_list = db_utils.digest_fasta( + fasta_filename="temp_fasta", + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + max_mods=1, + min_length=6, + max_length=50, + ) + peptide_list = [x[0] for x in peptide_list] + peptide_list = [ + x + for x in peptide_list + if not re.search( + r"(\+42\.011|\+43\.006|\-17\.027|\+43\.006\-17\.027)+[A-Z]\+", x + ) + ] + assert peptide_list == expected_1mod + + +def test_length_restrictions(fasta_raw_data): + + with open("temp_fasta", "w") as file: + file.write(fasta_raw_data) + + # length between 20 and 50 + expected_long = [ + "MEAPAQLLFLLLLWLPDTTR", + "ASQSVSSSYLTWYQQKPGQAPR", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + ] + + # length between 6 and 8 + expected_short = ["ATSIPAR", "VTLSC+57.021R"] + + peptide_list = db_utils.digest_fasta( + fasta_filename="temp_fasta", + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + max_mods=0, + min_length=20, + max_length=50, + ) + peptide_list = [x[0] for x in peptide_list] + assert peptide_list == expected_long + + peptide_list = db_utils.digest_fasta( + fasta_filename="temp_fasta", + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + max_mods=0, + min_length=6, + max_length=8, + ) + peptide_list = [x[0] for x in peptide_list] + assert peptide_list == expected_short + + +def test_digest_fasta_enzyme(fasta_raw_data): + + with open("temp_fasta", "w") as file: + file.write(fasta_raw_data) + + # arg-c enzyme + expected_argc = [ + "ATSIPAR", + "VTLSC+57.021R", + "LLIYGASTR", + "EIVMTQSPPTLSLSPGER", + "MEAPAQLLFLLLLWLPDTTR", + "ASQSVSSSYLTWYQQKPGQAPR", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + ] + + # asp-n enzyme + expected_aspn = ["DFAVYYC+57.021QQ", "DFTLTISSLQPE", "MEAPAQLLFLLLLWLP"] + + peptide_list = db_utils.digest_fasta( + fasta_filename="temp_fasta", + enzyme="arg-c", + digestion="full", + missed_cleavages=0, + max_mods=0, + min_length=6, + max_length=50, + ) + peptide_list = [x[0] for x in peptide_list] + assert peptide_list == expected_argc + + peptide_list = db_utils.digest_fasta( + fasta_filename="temp_fasta", + enzyme="asp-n", + digestion="full", + missed_cleavages=0, + max_mods=0, + min_length=6, + max_length=50, + ) + 
peptide_list = [x[0] for x in peptide_list] + assert peptide_list == expected_aspn + + +def test_get_candidates(fasta_raw_data): + + with open("temp_fasta", "w") as file: + file.write(fasta_raw_data) + + # precursor_window is 10000 + expected_smallwindow = ["LLIYGASTR"] + + # precursor window is 150000 + expected_midwindow = ["LLIYGASTR"] + + # precursor window is 600000 + expected_widewindow = ["ATSIPAR", "VTLSC+57.021R", "LLIYGASTR"] + + peptide_list = db_utils.digest_fasta( + fasta_filename="temp_fasta", + enzyme="trypsin", + digestion="full", + missed_cleavages=1, + max_mods=0, + min_length=6, + max_length=50, + ) + + candidates = db_utils.get_candidates( + precursor_mz=496.2, + charge=2, + peptide_list=peptide_list, + precursor_tolerance=10000, + isotope_error="0", + ) + candidates = [x[0] for x in candidates] + assert expected_smallwindow == candidates + + peptide_list = db_utils.digest_fasta( + fasta_filename="temp_fasta", + enzyme="trypsin", + digestion="full", + missed_cleavages=1, + max_mods=0, + min_length=6, + max_length=50, + ) + + candidates = db_utils.get_candidates( + precursor_mz=496.2, + charge=2, + peptide_list=peptide_list, + precursor_tolerance=150000, + isotope_error="0", + ) + candidates = [x[0] for x in candidates] + assert expected_midwindow == candidates + + peptide_list = db_utils.digest_fasta( + fasta_filename="temp_fasta", + enzyme="trypsin", + digestion="full", + missed_cleavages=1, + max_mods=0, + min_length=6, + max_length=50, + ) + + candidates = db_utils.get_candidates( + precursor_mz=496.2, + charge=2, + peptide_list=peptide_list, + precursor_tolerance=600000, + isotope_error="0", + ) + candidates = [x[0] for x in candidates] + assert expected_widewindow == candidates + + +def test_get_candidates_isotope_error(): + + # Tide isotope error windows for 496.2, 2+: + # 0: [980.481617, 1000.289326] + # 1: [979.491114, 999.278813] + # 2: [978.500611, 998.268300] + # 3: [977.510108, 997.257787] + + peptide_list = [ + ("A", 1001), + ("B", 1000), + ("C", 999), + ("D", 998), + ("E", 997), + ("F", 996), + ("G", 995), + ("H", 994), + ("I", 993), + ("J", 992), + ("K", 991), + ("L", 990), + ("M", 989), + ("N", 988), + ("O", 987), + ("P", 986), + ("Q", 985), + ("R", 984), + ("S", 983), + ("T", 982), + ("U", 981), + ("V", 980), + ("W", 979), + ("X", 978), + ("Y", 977), + ("Z", 976), + ] + + peptide_list.sort(key=lambda x: x[1]) + + expected_isotope0 = list("UTSRQPONMLKJIHGFEDCB") + expected_isotope1 = list("VUTSRQPONMLKJIHGFEDC") + expected_isotope2 = list("WVUTSRQPONMLKJIHGFED") + expected_isotope3 = list("XWVUTSRQPONMLKJIHGFE") + expected_isotope0123 = list("XWVUTSRQPONMLKJIHGFEDCB") + + candidates = db_utils.get_candidates( + precursor_mz=496.2, + charge=2, + peptide_list=peptide_list, + precursor_tolerance=10000, + isotope_error="0", + ) + candidates = [x[0] for x in candidates] + assert expected_isotope0 == candidates + + candidates = db_utils.get_candidates( + precursor_mz=496.2, + charge=2, + peptide_list=peptide_list, + precursor_tolerance=10000, + isotope_error="1", + ) + candidates = [x[0] for x in candidates] + assert expected_isotope1 == candidates + + candidates = db_utils.get_candidates( + precursor_mz=496.2, + charge=2, + peptide_list=peptide_list, + precursor_tolerance=10000, + isotope_error="2", + ) + candidates = [x[0] for x in candidates] + assert expected_isotope2 == candidates + + candidates = db_utils.get_candidates( + precursor_mz=496.2, + charge=2, + peptide_list=peptide_list, + precursor_tolerance=10000, + isotope_error="3", + ) + candidates = 
[x[0] for x in candidates] + assert expected_isotope3 == candidates + + candidates = db_utils.get_candidates( + precursor_mz=496.2, + charge=2, + peptide_list=peptide_list, + precursor_tolerance=10000, + isotope_error="0,1,2,3", + ) + candidates = [x[0] for x in candidates] + assert expected_isotope0123 == candidates + + def test_beam_search_decode(): """ Test beam search decoding and its sub-functions. From e2ce3172c89a5c4fc74256689fa3cdf6b01d1faf Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Tue, 2 Jul 2024 20:20:25 -0700 Subject: [PATCH 22/84] fix for issue with 0 candidates --- casanovo/denovo/model.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 4d9bd41b..02a324d3 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1071,15 +1071,13 @@ def smart_batch_gen(self, spectrum_batch): self.precursor_tolerance, self.isotope_error, ) - logger.debug("%s", digest_data) try: spec_peptides, pep_masses, pep_protein = list( zip(*digest_data) ) except ValueError: - logger.info( - "No peptides found for precursor %s", precursors[idx] - ) + logger.info("No peptides found for spectrum %s", indexes[idx]) + continue spec_precursors = [precursors[idx]] * len(spec_peptides) spec_enc = [enc[idx]] * len(spec_peptides) spec_idx = [indexes[idx]] * len(spec_peptides) From 5ef27e0c7dfffd219e5b248205a7ced0187ce4bb Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Wed, 3 Jul 2024 11:33:36 -0700 Subject: [PATCH 23/84] minor fixes added --- casanovo/data/datasets.py | 2 - casanovo/denovo/dataloaders.py | 13 --- casanovo/denovo/model.py | 31 +++--- casanovo/denovo/model_runner.py | 2 +- tests/conftest.py | 164 +++++++++----------------------- 5 files changed, 67 insertions(+), 145 deletions(-) diff --git a/casanovo/data/datasets.py b/casanovo/data/datasets.py index 59f56b68..6244e88f 100644 --- a/casanovo/data/datasets.py +++ b/casanovo/data/datasets.py @@ -134,8 +134,6 @@ def _process_peaks( The precursor m/z. precursor_charge : int The precursor charge. - track_spectrum_id : Optional[bool] - Whether to keep track of the identifier of the MS/MS spectra. Returns ------- diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index ba02936c..97bfb2fc 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -125,17 +125,6 @@ def setup(self, stage: str = None, annotated: bool = True) -> None: ) if self.test_index is not None: self.test_dataset = make_dataset(self.test_index) - if stage == "db": - make_dataset = functools.partial( - SpectrumDataset, - n_peaks=self.n_peaks, - min_mz=self.min_mz, - max_mz=self.max_mz, - min_intensity=self.min_intensity, - remove_precursor_tol=self.remove_precursor_tol, - ) - if self.test_index is not None: - self.test_dataset = make_dataset(self.test_index) def _make_loader( self, @@ -154,8 +143,6 @@ def _make_loader( The batch size to use. shuffle : bool Option to shuffle the batches. - db_mode : bool - Option to use the DataLoader for Casanovo-DB. Returns ------- diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 02a324d3..312e7f92 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -12,7 +12,6 @@ import numpy as np import lightning.pytorch as pl from torch.utils.tensorboard import SummaryWriter -from pyteomics import mass from depthcharge.components import ModelMixin, PeptideDecoder, SpectrumEncoder from . 
import evaluate @@ -992,10 +991,19 @@ def configure_optimizers( class DbSpec2Pep(Spec2Pep): """ - Inherits Spec2Pep + Subclass of Spec2Pep for the use of Casanovo as an MS/MS database search score function. - Hijacks teacher-forcing implemented in Spec2Pep and - uses it to predict scores between a spectra and associated peptide. + Uses teacher forcing to 'query' Casanovo for its score for each AA + within a candidate peptide, and takes the geometric average of these scores + and reports this as the score for the spectrum-peptide pair. Note that the + geometric mean of the AA scores is actually calculated by a + summation and average of the log of the scores, to preserve numerical + stability. This does not affect PSM ranking. + + Also note that although teacher-forcing is used within this method, + there is *no training* involved. This is a prediction-only method. + + Output is provided in .mztab format. """ def __init__(self, *args, **kwargs): @@ -1119,7 +1127,6 @@ def on_predict_batch_end( per_aa_score, precursors, ) in outputs: - prec_mass = precursors[:, 0] prec_charge = precursors[:, 1] prec_mz = precursors[:, 2] calc_mz = [ @@ -1140,9 +1147,9 @@ def on_predict_batch_end( def _calc_match_score( batch_all_aa_scores: torch.Tensor, - truth_aa_indicies: torch.Tensor, + truth_aa_indices: torch.Tensor, decoder_reverse: bool = False, -) -> List[float]: +) -> Tuple[torch.Tensor, torch.Tensor]: """ Calculate the score between the input spectra and associated peptide. @@ -1158,7 +1165,7 @@ def _calc_match_score( Amino acid scores for all amino acids in the vocabulary for every prediction made to generate the associated peptide (for an entire batch) - truth_aa_indicies : torch.Tensor + truth_aa_indices : torch.Tensor Indicies of the score for each actual amino acid in the peptide (for an entire batch) decoder_reverse : bool @@ -1166,7 +1173,7 @@ def _calc_match_score( Returns ------- - score : list[float], list[list[float]] + (all_scores, per_aa_scores) : Tuple[torch.Tensor, torch.Tensor] The score between the input spectra and associated peptide (for an entire batch) a list of lists of per amino acid scores @@ -1175,7 +1182,7 @@ def _calc_match_score( # Remove trailing tokens from predictions based on decoder reversal if decoder_reverse: batch_all_aa_scores = batch_all_aa_scores[:, 1:] - elif not decoder_reverse: + else: batch_all_aa_scores = batch_all_aa_scores[:, :-1] # Vectorized scoring using efficient indexing. 
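# ---------------------------------------------------------------------------
# [Editor's note: the sketch below is an illustration added for clarity and is
# not part of the patch. It shows the scoring idea used by DbSpec2Pep /
# _calc_match_score: each candidate peptide is scored by the mean of the
# log-probabilities of its residues (the log of their geometric mean), with
# padded positions masked out. All names here (geometric_mean_score,
# per_aa_probs, pad_mask) are hypothetical and chosen for illustration only;
# they are not Casanovo's actual API.]
import torch


def geometric_mean_score(
    per_aa_probs: torch.Tensor, pad_mask: torch.Tensor
) -> torch.Tensor:
    # per_aa_probs: (batch, seq_len) probability assigned to the true residue.
    # pad_mask: (batch, seq_len) True for real residues, False for padding.
    probs = per_aa_probs.clamp_min(1e-10)  # avoid log(0), as in the patch
    log_probs = torch.log(probs) * pad_mask  # zero out padded positions
    lengths = pad_mask.sum(dim=1).clamp_min(1)  # residues per candidate
    return log_probs.sum(dim=1) / lengths  # mean log-probability per peptide


# Two candidate peptides of lengths 3 and 2 (the second padded to length 3);
# a higher (less negative) value means a better spectrum-peptide match.
probs = torch.tensor([[0.9, 0.8, 0.7], [0.6, 0.5, 0.0]])
mask = torch.tensor([[True, True, True], [True, True, False]])
print(geometric_mean_score(probs, mask))
# ---------------------------------------------------------------------------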
@@ -1186,10 +1193,10 @@ def _calc_match_score( ) cols = torch.arange(0, batch_all_aa_scores.shape[1]).expand_as(rows) - per_aa_scores = batch_all_aa_scores[rows, cols, truth_aa_indicies] + per_aa_scores = batch_all_aa_scores[rows, cols, truth_aa_indices] per_aa_scores[per_aa_scores == 0] += 1e-10 - score_mask = truth_aa_indicies != 0 + score_mask = truth_aa_indices != 0 per_aa_scores[~score_mask] = 0 log_per_aa_scores = torch.log(per_aa_scores) all_scores = torch.where( diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 284acbe8..865df71b 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -151,7 +151,7 @@ def db_search( test_index = self._get_index(peak_path, False, "db search") self.writer.set_ms_run(test_index.ms_files) self.initialize_data_module(test_index=test_index) - self.loaders.setup(stage="db") + self.loaders.setup(stage="test", annotated=False) self.trainer.predict(self.model, self.loaders.predict_dataloader()) def train( diff --git a/tests/conftest.py b/tests/conftest.py index cac1a873..b2244308 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -16,7 +16,37 @@ def mgf_small(tmp_path): return _create_mgf(peptides, mgf_file) -def _create_mgf(peptides, mgf_file, random_state=42): +@pytest.fixture +def tiny_fasta_file(tmp_path, fasta_raw_data): + fasta_file = tmp_path / "tiny_fasta.fasta" + with fasta_file.open("w+") as fasta_ref: + fasta_ref.write(fasta_raw_data) + + return fasta_file + + +@pytest.fixture +def fasta_raw_data(): + return ">foo\nMEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGERVTLSCRASQSVSSSYLTWYQQKPGQAPRLLIYGASTRATSIPARFSGSGSGTDFTLTISSLQPEDFAVYYCQQDYNLP" + + +@pytest.fixture +def mgf_db_search(tmp_path): + """An MGF file with 7 spectra and scan numbers, C+57.021 mass modification considered""" + peptides = [ + "ATSIPAR", + "VTLSCR", + "LLIYGASTR", + "EIVMTQSPPTLSLSPGER", + "MEAPAQLLFLLLLWLPDTTR", + "ASQSVSSSYLTWYQQKPGQAPR", + "FSGSGSGTDFTLTISSLQPEDFAVYYCQQDYNLP", + ] + mgf_file = tmp_path / "db_search.mgf" + return _create_mgf(peptides, mgf_file, c_mod=True) + + +def _create_mgf(peptides, mgf_file, random_state=42, c_mod=False): """ Create a fake MGF file from one or more peptides. @@ -28,20 +58,25 @@ def _create_mgf(peptides, mgf_file, random_state=42): The MGF file to create. random_state : int or numpy.random.Generator, optional The random seed. The charge states are chosen to be 2 or 3 randomly. + c_mod : bool, optional + Whether to use the constant carbamidomethylation + of C in mass calculations. Returns ------- mgf_file : Path """ rng = np.random.default_rng(random_state) - entries = [_create_mgf_entry(p, rng.choice([2, 3])) for p in peptides] + entries = [ + _create_mgf_entry(p, rng.choice([2, 3]), c_mod) for p in peptides + ] with mgf_file.open("w+") as mgf_ref: mgf_ref.write("\n".join(entries)) return mgf_file -def _create_mgf_entry(peptide, charge=2): +def _create_mgf_entry(peptide, charge=2, c_mod=False): """ Create a MassIVE-KB style MGF entry for a single PSM. @@ -51,13 +86,21 @@ def _create_mgf_entry(peptide, charge=2): A peptide sequence. charge : int, optional The peptide charge state. + c_mod : bool, optional + Whether to use the constant carbamidomethylation + of C in mass calculations. Returns ------- str The PSM entry in an MGF file format. 
""" - precursor_mz = calculate_mass(peptide, charge=int(charge)) + if not c_mod: + precursor_mz = calculate_mass(peptide, charge=int(charge)) + else: + aa_mass = std_aa_mass + aa_mass.update({"C": 160.030649}) # Carbamidomethylated C mass + precursor_mz = fast_mass(peptide, charge=int(charge), aa_mass=aa_mass) mzs, intensities = _peptide_to_peaks(peptide, charge) frags = "\n".join([f"{m} {i}" for m, i in zip(mzs, intensities)]) @@ -263,119 +306,6 @@ def tiny_config(tmp_path): return cfg_file -@pytest.fixture -def tiny_fasta_file(tmp_path, fasta_raw_data): - fasta_file = tmp_path / "tiny_fasta.fasta" - with fasta_file.open("w+") as fasta_ref: - fasta_ref.write(fasta_raw_data) - - return fasta_file - - -@pytest.fixture -def fasta_raw_data(): - return ">foo\nMEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGERVTLSCRASQSVSSSYLTWYQQKPGQAPRLLIYGASTRATSIPARFSGSGSGTDFTLTISSLQPEDFAVYYCQQDYNLP" - - -@pytest.fixture -def mgf_db_search(tmp_path): - """An MGF file with 2 unannotated spectra and scan numbers.""" - peptides = [ - "ATSIPAR", - "VTLSCR", - "LLIYGASTR", - "EIVMTQSPPTLSLSPGER", - "MEAPAQLLFLLLLWLPDTTR", - "ASQSVSSSYLTWYQQKPGQAPR", - "FSGSGSGTDFTLTISSLQPEDFAVYYCQQDYNLP", - ] - mgf_file = tmp_path / "db_search.mgf" - return _create_unannotated_mgf(peptides, mgf_file, c_mod=True) - - -@pytest.fixture -def mgf_small_unannotated(tmp_path): - """An MGF file with 2 unannotated spectra and scan numbers.""" - peptides = ["LESLIEK", "PEPTIDEK", "LESTIEK"] - mgf_file = tmp_path / "small_unannotated.mgf" - return _create_unannotated_mgf(peptides, mgf_file) - - -def _create_unannotated_mgf(peptides, mgf_file, random_state=999, c_mod=False): - """ - Create a fake MGF file from one or more peptides. - This file will have no SEQ= parameter, but will have a SCANS= parameter. - - Parameters - ---------- - peptides : str or list of str - The peptides for which to create spectra. - mgf_file : Path - The MGF file to create. - random_state : int or numpy.random.Generator, optional - The random seed. The charge states are chosen to be 2 or 3 randomly. - c_mod : bool, optional - Whether to use the constant carbamidomethylation - of C in mass calculations. - - Returns - ------- - mgf_file : Path - """ - rng = np.random.default_rng(random_state) - entries = [ - _create_unannotated_mgf_entry(p, idx, rng.choice([2, 3]), c_mod=c_mod) - for idx, p in enumerate(peptides) - ] - with mgf_file.open("w+") as mgf_ref: - mgf_ref.write("\n".join(entries)) - - return mgf_file - - -def _create_unannotated_mgf_entry(peptide, scan_num, charge, c_mod=False): - """ - Create a MassIVE-KB style MGF entry for a single PSM. - Each entry will have no SEQ= parameter, but will have a SCANS= parameter. - - Parameters - ---------- - peptide : str - A peptide sequence. - scan_num : int - The scan number. - charge : int, optional - The peptide charge state. - c_mod : bool, optional - Whether to use the constant carbamidomethylation - of C in mass calculations. - - Returns - ------- - str - The PSM entry in an MGF file format. 
- """ - if not c_mod: - precursor_mz = calculate_mass(peptide, charge=int(charge)) - else: - aa_mass = std_aa_mass - aa_mass.update({"C": 160.030649}) # Carbamidomethylated C mass - precursor_mz = fast_mass(peptide, charge=int(charge), aa_mass=aa_mass) - mzs, intensities = _peptide_to_peaks(peptide, charge) - frags = "\n".join([f"{m} {i}" for m, i in zip(mzs, intensities)]) - - mgf = [ - "BEGIN IONS", - f"TITLE=title::{scan_num}", - f"PEPMASS={precursor_mz}", - f"CHARGE={charge}+", - f"SCANS={scan_num}", - f"{frags}", - "END IONS", - ] - return "\n".join(mgf) - - @pytest.fixture def tide_dir_small(tmp_path): """A directory with a very small TIDE search result.""" From 5f0675f032579e2976718c619969bdfd47cc68c5 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Wed, 3 Jul 2024 14:20:56 -0700 Subject: [PATCH 24/84] reordered and renamed variables for consistency --- casanovo/denovo/model.py | 45 ++++++++++++++++++--------------- casanovo/denovo/model_runner.py | 10 ++++---- 2 files changed, 29 insertions(+), 26 deletions(-) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 312e7f92..8bb0dbee 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1021,30 +1021,34 @@ def predict_step(self, batch, *args): Returns ------- - predictions: List[Tuple[int, str, float, np.ndarray, np.ndarray]] + predictions: List[Tuple[int, int, float, str, np.ndarray, np.ndarray]] Model predictions for the given batch of spectra containing spectrum - scan number, peptide sequence, Casanovo-DB score, - amino acid-level confidence scores, and precursor information. + ids, precursor charge and m/z, candidate peptide sequences, peptide + scores, and amino acid-level scores. """ batch_res = [] for ( - indexes, + spectrum_i, peptides, precursors, encoded_ms, ) in self.smart_batch_gen(batch): pred, truth = self.decoder(peptides, precursors, *encoded_ms) pred = self.softmax(pred) - score_result, per_aa_score = _calc_match_score( + peptide_scores, aa_scores = _calc_match_score( pred, truth, self.decoder.reverse ) + precursor_info = precursors.cpu().detach().numpy() + precursor_charge = precursor_info[:, 1] + precursor_mz = precursor_info[:, 2] batch_res.append( ( - indexes, + spectrum_i, + precursor_charge, + precursor_mz, peptides, - score_result.cpu().detach().numpy(), - per_aa_score.cpu().detach().numpy(), - precursors.cpu().detach().numpy(), + peptide_scores.cpu().detach().numpy(), + aa_scores.cpu().detach().numpy(), ) ) return batch_res @@ -1121,26 +1125,25 @@ def on_predict_batch_end( if self.out_writer is None: return for ( - indexes, + spectrum_i, + precursor_charge, + precursor_mz, peptides, - score_result, - per_aa_score, - precursors, + peptide_scores, + aa_scores, ) in outputs: - prec_charge = precursors[:, 1] - prec_mz = precursors[:, 2] calc_mz = [ self.peptide_mass_calculator.mass(peptide, charge) - for peptide, charge in zip(peptides, prec_charge) + for peptide, charge in zip(peptides, precursor_charge) ] for row in zip( peptides, - score_result, - prec_charge, - prec_mz, + peptide_scores, + precursor_charge, + precursor_mz, calc_mz, - indexes, - per_aa_score, + spectrum_i, + aa_scores, ): self.out_writer.psms.append(row) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 865df71b..1457df38 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -333,12 +333,12 @@ def initialize_model( if self.model_filename is None: # Train a model from scratch if no model file is provided. 
+ if db_search: + logger.error("DB search mode requires a model file") + raise ValueError( + "A model file must be provided for DB search mode" + ) if train: - if db_search: - logger.error("Db search mode requires a model file.") - raise ValueError( - "A model file must be provided for DB search mode" - ) self.model = Spec2Pep(**model_params) return # Else we're not training, so a model file must be provided. From b4fd8ff05eaebcf62351627c8ceee2fee3bc23a1 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Thu, 4 Jul 2024 14:39:36 -0700 Subject: [PATCH 25/84] casanovo-db full working version with code simplification --- casanovo/data/db_utils.py | 16 +++- casanovo/data/ms_io.py | 80 +--------------- casanovo/denovo/dataloaders.py | 111 ++++++++++++++++++++++ casanovo/denovo/model.py | 159 +++++++++++--------------------- casanovo/denovo/model_runner.py | 32 ++++--- 5 files changed, 198 insertions(+), 200 deletions(-) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index 341a6162..921c75bd 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -4,9 +4,12 @@ import depthcharge.masses from pyteomics import fasta, parser import bisect +import logging from typing import List, Tuple +logger = logging.getLogger("casanovo") + # CONSTANTS HYDROGEN = 1.007825035 OXYGEN = 15.99491463 @@ -96,17 +99,22 @@ def digest_fasta( semi=semi, ) protein = header.split()[0] - peptide_list.extend([(pep, protein) for pep in pep_set]) + for pep in pep_set: + if len(pep) < min_length or len(pep) > max_length: + continue + if "X" in pep or "U" in pep: + logger.warn( + "Skipping peptide with ambiguous amino acids: %s", pep + ) + continue + peptide_list.append((pep, protein)) else: raise ValueError(f"Digestion type {digestion} not recognized.") # Generate modified peptides mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") - mass_calculator.masses.update({"X": 0.0}) # TODO: REMOVE? mod_peptide_list = [] for pep, prot in peptide_list: - if len(pep) < min_length or len(pep) > max_length: - continue peptide_isoforms = parser.isoforms( pep, variable_mods=var_mods, diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py index a701b627..b27f083b 100644 --- a/casanovo/data/ms_io.py +++ b/casanovo/data/ms_io.py @@ -22,13 +22,10 @@ class MztabWriter: ---------- filename : str The name of the mzTab file. - is_db_variant : bool - Whether the mzTab file is for a Casanovo-DB search. """ - def __init__(self, filename: str, is_db_variant: bool = False): + def __init__(self, filename: str): self.filename = filename - self.is_db_variant = is_db_variant self.metadata = [ ("mzTab-version", "1.0.0"), ("mzTab-mode", "Summary"), @@ -150,9 +147,6 @@ def save(self) -> None: """ Export the spectrum identifications to the mzTab file. """ - if self.is_db_variant: - self.save_db_variant() - return with open(self.filename, "w", newline="") as f: writer = csv.writer(f, delimiter="\t", lineterminator=os.linesep) # Write metadata. @@ -192,7 +186,7 @@ def save(self) -> None: "PSM", psm[0], # sequence i, # PSM_ID - "null", # accession + "null" if len(psm) < 8 else psm[7], # accession "null", # unique "null", # database "null", # database_version @@ -215,73 +209,3 @@ def save(self) -> None: psm[6], # opt_ms_run[1]_aa_scores ] ) - - def save_db_variant(self) -> None: - """ - Export the Casanovo-DB search results to the mzTab file. - - Outputs PSMs in the order they were scored - (i.e. the order in the .mgf file). 
- """ - with open(self.filename, "w", newline="") as f: - writer = csv.writer(f, delimiter="\t", lineterminator=os.linesep) - # Write metadata. - for row in self.metadata: - writer.writerow(["MTD", *row]) - # Write PSMs. - writer.writerow( - [ - "PSH", - "sequence", - "PSM_ID", - "accession", - "unique", - "database", - "database_version", - "search_engine", - "search_engine_score[1]", - "modifications", - "retention_time", - "charge", - "exp_mass_to_charge", - "calc_mass_to_charge", - "spectra_ref", - "pre", - "post", - "start", - "end", - "opt_ms_run[1]_aa_scores", - ] - ) - for i, psm in enumerate(self.psms): - writer.writerow( - [ - "PSM", - psm[0], # sequence - f"{psm[5]}:{i}", # PSM_ID (spectrum # :candidate #) - "null", # accession - "null", # unique - "null", # database - "null", # database_version - "null", # search_engine - psm[1], # search_engine_score[1] - "null", # modifications - "null", # retention_time - int(psm[2]), # charge - psm[3], # exp_mass_to_charge - psm[4], # calc_mass_to_charge - psm[5], # spectra_ref - "null", # pre - "null", # post - "null", # start - "null", # end - ",".join( - list( - map( - "{:.5f}".format, - psm[6][psm[6] != 0], - ) - ) - ), # opt_ms_run[1]_aa_scores - ] - ) diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 97bfb2fc..80a4f7dc 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -3,6 +3,8 @@ import functools import os from typing import List, Optional, Tuple +from functools import partial +import logging import lightning.pytorch as pl import numpy as np @@ -13,6 +15,9 @@ AnnotatedSpectrumDataset, SpectrumDataset, ) +from ..data import db_utils + +logger = logging.getLogger("casanovo") class DeNovoDataModule(pl.LightningDataModule): @@ -176,6 +181,22 @@ def predict_dataloader(self) -> torch.utils.data.DataLoader: """Get the predict DataLoader.""" return self._make_loader(self.test_dataset, self.eval_batch_size) + def db_dataloader(self) -> torch.utils.data.DataLoader: + """Get a special dataloader for DB search""" + return torch.utils.data.DataLoader( + self.test_dataset, + batch_size=self.eval_batch_size, + collate_fn=partial( + prepare_psm_batch, + digest=self.digest, + precursor_tolerance=self.precursor_tolerance, + isotope_error=self.isotope_error, + ), + pin_memory=True, + num_workers=self.n_workers, + shuffle=False, + ) + def prepare_batch( batch: List[Tuple[torch.Tensor, float, int, str]] @@ -214,3 +235,93 @@ def prepare_batch( [precursor_masses, precursor_charges, precursor_mzs] ).T.float() return spectra, precursors, np.asarray(spectrum_ids) + + +def prepare_psm_batch( + batch: List[Tuple[torch.Tensor, float, int, str]], + digest: List[Tuple[str, float, str]], + precursor_tolerance: float, + isotope_error: str, +): + """ + Collate MS/MS spectra into a batch for DB search. + + The MS/MS spectra will be padded so that they fit nicely as a tensor. + However, the padded elements are ignored during the subsequent steps. + + Parameters + ---------- + batch : List[Tuple[torch.Tensor, float, int, str]] + A batch of data from an AnnotatedSpectrumDataset, consisting of for each + spectrum (i) a tensor with the m/z and intensity peak values, (ii), the + precursor m/z, (iii) the precursor charge, (iv) the spectrum identifier. + digest : List[Tuple[str, float, str]] + A list of tuples containing the peptide sequence, mass, and associated protein + from digesting a .fasta file. Sorted by mass in ascending order. Uses neutral masses. 
+ precursor_tolerance : float + The precursor mass tolerance in parts-per-million. + isotope_error : str + The isotope error levels to consider. + + Returns + ------- + all_spectra : torch.Tensor of shape (batch_size, n_peaks, 2) + The padded mass spectra tensor with the m/z and intensity peak values + for each spectrum. + all_precursors : torch.Tensor of shape (batch_size, 3) + A tensor with the precursor neutral mass, precursor charge, and + precursor m/z. + all_spectrum_ids : np.ndarray + The spectrum identifiers. + all_peptides : List[str] + The candidate peptides for each spectrum. + all_proteins : List[str] + The associated proteins for each candidate peptide. + """ + spectra, precursor_mzs, precursor_charges, spectrum_ids = list(zip(*batch)) + spectra = torch.nn.utils.rnn.pad_sequence(spectra, batch_first=True) + + precursor_mzs = torch.tensor(precursor_mzs) + precursor_charges = torch.tensor(precursor_charges) + precursor_masses = (precursor_mzs - 1.007276) * precursor_charges + precursors = torch.vstack( + [precursor_masses, precursor_charges, precursor_mzs] + ).T.float() + + all_spectra = [] + all_precursors = [] + all_spectrum_ids = [] + all_peptides = [] + all_proteins = [] + for idx in range(len(batch)): + digest_data = db_utils.get_candidates( + precursor_mzs[idx], + precursor_charges[idx], + digest, + precursor_tolerance, + isotope_error, + ) + try: + spec_peptides, _, pep_protein = list(zip(*digest_data)) + all_spectra.append( + spectra[idx].unsqueeze(0).repeat(len(spec_peptides), 1, 1) + ) + all_precursors.append( + precursors[idx].unsqueeze(0).repeat(len(spec_peptides), 1) + ) + all_spectrum_ids.extend([spectrum_ids[idx]] * len(spec_peptides)) + all_peptides.extend(spec_peptides) + all_proteins.extend(pep_protein) + except ValueError: + logger.warning( + "No candidates found for spectrum %s", spectrum_ids[idx] + ) + continue + + return ( + torch.cat(all_spectra, dim=0), + torch.cat(all_precursors, dim=0), + all_spectrum_ids, + all_peptides, + all_proteins, + ) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 8bb0dbee..2256946c 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1008,6 +1008,7 @@ class DbSpec2Pep(Spec2Pep): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self.total_psms = 0 def predict_step(self, batch, *args): """ @@ -1015,137 +1016,85 @@ def predict_step(self, batch, *args): Parameters ---------- - batch : Tuple[torch.Tensor, torch.Tensor, torch.Tensor] + batch : Tuple[torch.Tensor, torch.Tensor, np.array, List[str], List[str]] A batch of (i) MS/MS spectra, (ii) precursor information, (iii) - spectrum identifiers as torch Tensors. + spectrum identifiers, (iv) candidate peptides, (v) associated proteins. Returns ------- - predictions: List[Tuple[int, int, float, str, np.ndarray, np.ndarray]] + predictions: List[Tuple[int, int, float, str, np.ndarray, np.ndarray, str]] Model predictions for the given batch of spectra containing spectrum ids, precursor charge and m/z, candidate peptide sequences, peptide - scores, and amino acid-level scores. + scores, amino acid-level scores, and associated proteins. 
""" - batch_res = [] + predictions = [] + pred, truth = self.decoder(batch[3], batch[1], *self.encoder(batch[0])) + pred = self.softmax(pred) + all_scores, per_aa_scores = _calc_match_score( + pred, truth, self.decoder.reverse + ) for ( + precursor_charge, + precursor_mz, spectrum_i, - peptides, - precursors, - encoded_ms, - ) in self.smart_batch_gen(batch): - pred, truth = self.decoder(peptides, precursors, *encoded_ms) - pred = self.softmax(pred) - peptide_scores, aa_scores = _calc_match_score( - pred, truth, self.decoder.reverse - ) - precursor_info = precursors.cpu().detach().numpy() - precursor_charge = precursor_info[:, 1] - precursor_mz = precursor_info[:, 2] - batch_res.append( + peptide_score, + aa_scores, + peptide, + protein, + ) in zip( + batch[1][:, 1].cpu().detach().numpy(), + batch[1][:, 2].cpu().detach().numpy(), + batch[2], + all_scores.cpu().detach().numpy(), + per_aa_scores.cpu().detach().numpy(), + batch[3], + batch[4], + ): + predictions.append( ( spectrum_i, precursor_charge, precursor_mz, - peptides, - peptide_scores.cpu().detach().numpy(), - aa_scores.cpu().detach().numpy(), - ) - ) - return batch_res - - def smart_batch_gen(self, spectrum_batch): - """ - Transforms a batch of spectra into multiple equally-sized batches of PSMs. - - Parameters - ---------- - spectrum batch : Tuple[torch.Tensor, torch.Tensor, torch.Tensor] - A batch of (i) MS/MS spectra, (ii) precursor information, (iii) - spectrum identifiers as torch Tensors. - - Yields - ------- - psm_batch: Tuple[List[int], List[str], torch.Tensor, Tuple[torch.Tensor, torch.Tensor]] - A batch of PSMs containing the spectrum index, peptide sequence, - precursor information, and encoded MS/MS spectra. - """ - all_psm = [] - batch_size = len(spectrum_batch[0]) - enc = self.encoder(spectrum_batch[0]) - enc = list(zip(*enc)) - precursors = spectrum_batch[1] - indexes = spectrum_batch[2] - for idx in range(batch_size): - digest_data = db_utils.get_candidates( - precursors[idx][2], - precursors[idx][1], - self.digest, - self.precursor_tolerance, - self.isotope_error, - ) - try: - spec_peptides, pep_masses, pep_protein = list( - zip(*digest_data) - ) - except ValueError: - logger.info("No peptides found for spectrum %s", indexes[idx]) - continue - spec_precursors = [precursors[idx]] * len(spec_peptides) - spec_enc = [enc[idx]] * len(spec_peptides) - spec_idx = [indexes[idx]] * len(spec_peptides) - all_psm.extend( - list( - zip( - spec_enc, - spec_precursors, - spec_peptides, - spec_idx, - ) + peptide, + peptide_score, + aa_scores, + protein, ) ) - # Continually grab num_pairs items from all_psm until list is exhausted - while len(all_psm) > 0: - psm_batch = all_psm[:batch_size] - all_psm = all_psm[batch_size:] - psm_batch = list(zip(*psm_batch)) - encoded_ms = ( - torch.stack([a[0] for a in psm_batch[0]]), - torch.stack([a[1] for a in psm_batch[0]]), - ) - prec_data = torch.stack(psm_batch[1]) - pep_str = list(psm_batch[2]) - indexes = [a[1] for a in psm_batch[3]] - yield (indexes, pep_str, prec_data, encoded_ms) + self.total_psms += len(predictions) + return predictions def on_predict_batch_end( self, outputs: List[Tuple[np.ndarray, List[str], torch.Tensor]], *args, ) -> None: - if self.out_writer is None: - return + """ + Write the database search results to the output file. 
+ """ for ( spectrum_i, - precursor_charge, + charge, precursor_mz, - peptides, - peptide_scores, + peptide, + peptide_score, aa_scores, + protein, ) in outputs: - calc_mz = [ - self.peptide_mass_calculator.mass(peptide, charge) - for peptide, charge in zip(peptides, precursor_charge) - ] - for row in zip( - peptides, - peptide_scores, - precursor_charge, - precursor_mz, - calc_mz, - spectrum_i, - aa_scores, - ): - self.out_writer.psms.append(row) + if len(peptide) == 0: + continue + self.out_writer.psms.append( + ( + peptide, + tuple(spectrum_i), + peptide_score, + charge, + precursor_mz, + self.peptide_mass_calculator.mass(peptide, charge), + ",".join(list(map("{:.5f}".format, aa_scores))), + protein, + ), + ) def _calc_match_score( diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 1457df38..3286f4b8 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -10,6 +10,8 @@ from pathlib import Path from typing import Iterable, List, Optional, Union +import time + import lightning.pytorch as pl import numpy as np import torch @@ -124,19 +126,21 @@ def db_search( ------- self """ - self.writer = ms_io.MztabWriter( - Path(output).with_suffix(".mztab"), is_db_variant=True - ) + self.writer = ms_io.MztabWriter(Path(output).with_suffix(".mztab")) self.writer.set_metadata( self.config, model=str(self.model_filename), config_filename=self.config.file, ) - self.initialize_trainer(train=True) self.initialize_model(train=False, db_search=True) self.model.out_writer = self.writer - self.model.digest = db_utils.digest_fasta( + test_index = self._get_index(peak_path, False, "db search") + self.writer.set_ms_run(test_index.ms_files) + + self.initialize_data_module(test_index=test_index) + self.loaders.setup(stage="test", annotated=False) + self.loaders.digest = db_utils.digest_fasta( fasta_path, enzyme, digestion, @@ -145,14 +149,16 @@ def db_search( min_length, max_length, ) - self.model.precursor_tolerance = precursor_tolerance - self.model.isotope_error = isotope_error - - test_index = self._get_index(peak_path, False, "db search") - self.writer.set_ms_run(test_index.ms_files) - self.initialize_data_module(test_index=test_index) - self.loaders.setup(stage="test", annotated=False) - self.trainer.predict(self.model, self.loaders.predict_dataloader()) + self.loaders.precursor_tolerance = precursor_tolerance + self.loaders.isotope_error = isotope_error + + t1 = time.time() + self.trainer.predict(self.model, self.loaders.db_dataloader()) + t2 = time.time() + logger.info("Database search took %.3f seconds", t2 - t1) + logger.info("Scored %s PSMs", self.model.total_psms) + logger.info("%.3f PSMs per second", self.model.total_psms / (t2 - t1)) + logger.info("%s seconds per PSM", (t2 - t1) / self.model.total_psms) def train( self, From 35ba7d497cbc0c044ca5e13fd8e6e09162f77590 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 4 Jul 2024 21:50:44 +0000 Subject: [PATCH 26/84] Generate new screengrabs with rich-codex --- docs/images/configure-help.svg | 154 +++++++++++++++------- docs/images/evaluate-help.svg | 182 ++++++++++++++------------ docs/images/help.svg | 224 ++++++++++++++------------------ docs/images/sequence-help.svg | 182 ++++++++++++++------------ docs/images/train-help.svg | 228 ++++++++++++++------------------- 5 files changed, 493 insertions(+), 477 deletions(-) diff --git a/docs/images/configure-help.svg b/docs/images/configure-help.svg index 0822927a..4092bce3 100644 --- a/docs/images/configure-help.svg +++ 
b/docs/images/configure-help.svg
[Editor's note: the bodies of the five regenerated screengrab SVGs
(configure-help.svg, evaluate-help.svg, help.svg, sequence-help.svg, and
train-help.svg) are omitted here. In each file, the previous rendering of the
corresponding `casanovo ... --help` output is replaced by the same Python
traceback: importing casanovo fails via depthcharge -> tensorboard with
"AttributeError: `np.string_` was removed in the NumPy 2.0 release. Use
`np.bytes_` instead.", i.e. the rich-codex screengrab run crashed under
NumPy 2.0 instead of rendering the help text.]

From f8a1a8964f929b793cd58844072d76656b4ac0f1 Mon Sep 17 00:00:00 2001
From: VarunAnanth2003
Date: Mon, 8 Jul 2024 12:14:52 -0700
Subject: [PATCH 27/84] fix batching issues

---
 casanovo/denovo/model.py        | 71 ++++++++++++++++++---------------
 casanovo/denovo/model_runner.py |  1 +
 2 files changed, 40 insertions(+), 32 deletions(-)

diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py
index 2256946c..3a069dcd 100644
--- a/casanovo/denovo/model.py
+++ b/casanovo/denovo/model.py
@@ -1009,6 +1009,7 @@ class DbSpec2Pep(Spec2Pep):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.total_psms = 0
+        self.psm_batch_size = 1024
 
     def predict_step(self, batch, *args):
         """
@@ -1028,39 +1029,45 @@ def predict_step(self, batch, *args):
             scores, amino acid-level scores, and associated proteins.
""" predictions = [] - pred, truth = self.decoder(batch[3], batch[1], *self.encoder(batch[0])) - pred = self.softmax(pred) - all_scores, per_aa_scores = _calc_match_score( - pred, truth, self.decoder.reverse - ) - for ( - precursor_charge, - precursor_mz, - spectrum_i, - peptide_score, - aa_scores, - peptide, - protein, - ) in zip( - batch[1][:, 1].cpu().detach().numpy(), - batch[1][:, 2].cpu().detach().numpy(), - batch[2], - all_scores.cpu().detach().numpy(), - per_aa_scores.cpu().detach().numpy(), - batch[3], - batch[4], - ): - predictions.append( - ( - spectrum_i, - precursor_charge, - precursor_mz, - peptide, - peptide_score, - aa_scores, - protein, - ) + while len(batch[0]) > 0: + next_batch = [b[self.psm_batch_size :] for b in batch] + batch = [b[: self.psm_batch_size] for b in batch] + pred, truth = self.decoder( + batch[3], batch[1], *self.encoder(batch[0]) ) + pred = self.softmax(pred) + all_scores, per_aa_scores = _calc_match_score( + pred, truth, self.decoder.reverse + ) + for ( + precursor_charge, + precursor_mz, + spectrum_i, + peptide_score, + aa_scores, + peptide, + protein, + ) in zip( + batch[1][:, 1].cpu().detach().numpy(), + batch[1][:, 2].cpu().detach().numpy(), + batch[2], + all_scores.cpu().detach().numpy(), + per_aa_scores.cpu().detach().numpy(), + batch[3], + batch[4], + ): + predictions.append( + ( + spectrum_i, + precursor_charge, + precursor_mz, + peptide, + peptide_score, + aa_scores, + protein, + ) + ) + batch = next_batch self.total_psms += len(predictions) return predictions diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 3286f4b8..a6b59ed9 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -135,6 +135,7 @@ def db_search( self.initialize_trainer(train=True) self.initialize_model(train=False, db_search=True) self.model.out_writer = self.writer + self.model.psm_batch_size = self.config.predict_batch_size test_index = self._get_index(peak_path, False, "db search") self.writer.set_ms_run(test_index.ms_files) From 7cb8e141ccab5b865a3af00711d290cd6cab788d Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Mon, 12 Aug 2024 14:50:18 -0700 Subject: [PATCH 28/84] small fixes regarding documentation, import syntax, etc. --- casanovo/casanovo.py | 39 ++++++---- casanovo/data/db_utils.py | 71 +++++++++-------- casanovo/denovo/dataloaders.py | 10 +-- casanovo/denovo/model.py | 31 ++++---- casanovo/denovo/model_runner.py | 24 ++---- tests/conftest.py | 11 +-- tests/unit_tests/test_unit.py | 132 +++++++++++--------------------- 7 files changed, 137 insertions(+), 181 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 8ae9a81b..4b9b4e38 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -130,7 +130,7 @@ def sequence( ) -> None: """De novo sequence peptides from tandem mass spectra. - PEAK_PATH must be one or more mzMl, mzXML, or MGF files from which + PEAK_PATH must be one or more mzML, mzXML, or MGF files from which to sequence peptides. """ output = setup_logging(output, verbosity) @@ -205,7 +205,7 @@ def sequence( ) @click.option( "--digestion", - help="Digestion: full, partial", + help="Full: standard digestion. 
Semi: Include products of semi-specific cleavage", type=click.Choice( ["full", "partial"], case_sensitive=False, @@ -214,37 +214,41 @@ def sequence( ) @click.option( "--missed_cleavages", - help="Number of allowed missed cleavages", + help="Number of allowed missed cleavages when digesting protein", type=int, default=0, ) @click.option( "--max_mods", - help="Maximum number of modifications per peptide", + help="Maximum number of amino acid modifications per peptide", type=int, default=0, ) @click.option( - "--min_length", - help="Minimum peptide length", + "--min_peptide_length", + help="Minimum peptide length to consider", type=int, default=6, ) @click.option( - "--max_length", - help="Maximum peptide length", + "--max_peptide_length", + help="Maximum peptide length to consider", type=int, default=50, ) @click.option( "--precursor_tolerance", - help="Precursor tolerance window size (ppm)", - type=int, + help="Precursor tolerance window size (units: ppm)", + type=float, default=20, ) @click.option( "--isotope_error", - help="Isotope error levels to consider (list of ints, e.g: 1,2)", + help="Isotope error levels to consider. \ + Creates multiple mass windows to consider per spectrum \ + to account for observed mass not matching monoisotopic mass \ + due to the instrument assigning the 13C isotope \ + peak as the precursor (list of ints, e.g: 1,2)", type=str, default="0", ) @@ -255,9 +259,9 @@ def db_search( digestion: str, missed_cleavages: int, max_mods: int, - min_length: int, - max_length: int, - precursor_tolerance: int, + min_peptide_length: int, + max_peptide_length: int, + precursor_tolerance: float, isotope_error: str, model: Optional[str], config: Optional[str], @@ -266,7 +270,8 @@ def db_search( ) -> None: """Perform a database search on MS/MS data using Casanovo-DB. - PEAK_PATH must be one MGF file. FASTA_PATH must be one FASTA file. + PEAK_PATH must be one or more mzML, mzXML, or MGF files. + FASTA_PATH must be one FASTA file. """ output = setup_logging(output, verbosity) config, model = setup_model(model, config, output, False) @@ -284,8 +289,8 @@ def db_search( digestion, missed_cleavages, max_mods, - min_length, - max_length, + min_peptide_length, + max_peptide_length, precursor_tolerance, isotope_error, output, diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index 921c75bd..1af09a47 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -1,15 +1,16 @@ """Unique methods used within db-search mode""" -import os -import depthcharge.masses -from pyteomics import fasta, parser import bisect import logging - +import os from typing import List, Tuple +import depthcharge.masses +from pyteomics import fasta, parser + logger = logging.getLogger("casanovo") + # CONSTANTS HYDROGEN = 1.007825035 OXYGEN = 15.99491463 @@ -51,8 +52,8 @@ def digest_fasta( digestion: str, missed_cleavages: int, max_mods: int, - min_length: int, - max_length: int, + min_peptide_length: int, + max_peptide_length: int, ): """ Digests a FASTA file and returns the peptides, their masses, and associated protein. @@ -70,9 +71,9 @@ def digest_fasta( The number of missed cleavages to allow. max_mods : int The maximum number of modifications to allow per peptide. - min_length : int + min_peptide_length : int The minimum length of peptides to consider. - max_length : int + max_peptide_length : int The maximum length of peptides to consider. Returns @@ -81,35 +82,36 @@ def digest_fasta( A list of tuples containing the peptide sequence, mass, and associated protein. 
Sorted by neutral mass in ascending order. """ - - # Verify the eistence of the file: + # Verify the existence of the file: if not os.path.isfile(fasta_filename): - print(f"File {fasta_filename} does not exist.") + logger.error("File %s does not exist.", fasta_filename) raise FileNotFoundError(f"File {fasta_filename} does not exist.") fasta_data = fasta.read(fasta_filename) peptide_list = [] - if digestion in ["full", "partial"]: - semi = True if digestion == "partial" else False - for header, seq in fasta_data: - pep_set = parser.cleave( - seq, - rule=parser.expasy_rules[enzyme], - missed_cleavages=missed_cleavages, - semi=semi, - ) - protein = header.split()[0] - for pep in pep_set: - if len(pep) < min_length or len(pep) > max_length: - continue - if "X" in pep or "U" in pep: - logger.warn( - "Skipping peptide with ambiguous amino acids: %s", pep - ) - continue - peptide_list.append((pep, protein)) - else: + if digestion not in ["full", "partial"]: + logger.error("Digestion type %s not recognized.", digestion) raise ValueError(f"Digestion type {digestion} not recognized.") + semi = digestion == "partial" + for header, seq in fasta_data: + pep_set = parser.cleave( + seq, + rule=parser.expasy_rules[enzyme], + missed_cleavages=missed_cleavages, + semi=semi, + ) + protein = header.split()[0] + for pep in pep_set: + if len(pep) < min_peptide_length or len(pep) > max_peptide_length: + continue + if any( + aa in pep for aa in "BJOUXZ" + ): # Check for incorrect AA letters + logger.warn( + "Skipping peptide with ambiguous amino acids: %s", pep + ) + continue + peptide_list.append((pep, protein)) # Generate modified peptides mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") @@ -136,7 +138,7 @@ def get_candidates( precursor_mz: float, charge: int, peptide_list: List[Tuple[str, float, str]], - precursor_tolerance: int, + precursor_tolerance: float, isotope_error: str, ): """ @@ -156,7 +158,6 @@ def get_candidates( isotope_error : str The isotope error levels to consider. """ - candidates = set() isotope_error = [int(x) for x in isotope_error.split(",")] @@ -219,7 +220,9 @@ def _to_raw_mass(mz_mass, charge): def get_mass_indices(masses, m_low, m_high): - """Grabs mass indices from a list of mass values that fall within a specified range. + """Grabs mass indices that fall within a specified range. + + Pulls from masses, a list of mass values. Requires that the mass values are sorted in ascending order. 
Parameters diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 80a4f7dc..14a0ff99 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -2,20 +2,20 @@ import functools import os -from typing import List, Optional, Tuple -from functools import partial import logging +from typing import List, Optional, Tuple +from depthcharge.data import AnnotatedSpectrumIndex import lightning.pytorch as pl import numpy as np import torch -from depthcharge.data import AnnotatedSpectrumIndex +from ..data import db_utils from ..data.datasets import ( AnnotatedSpectrumDataset, SpectrumDataset, ) -from ..data import db_utils + logger = logging.getLogger("casanovo") @@ -186,7 +186,7 @@ def db_dataloader(self) -> torch.utils.data.DataLoader: return torch.utils.data.DataLoader( self.test_dataset, batch_size=self.eval_batch_size, - collate_fn=partial( + collate_fn=functools.partial( prepare_psm_batch, digest=self.digest, precursor_tolerance=self.precursor_tolerance, diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 3a069dcd..79848682 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -16,7 +16,7 @@ from . import evaluate from .. import config -from ..data import ms_io, db_utils +from ..data import ms_io logger = logging.getLogger("casanovo") @@ -991,7 +991,8 @@ def configure_optimizers( class DbSpec2Pep(Spec2Pep): """ - Subclass of Spec2Pep for the use of Casanovo as an MS/MS database search score function. + Subclass of Spec2Pep for the use of Casanovo as an \ + MS/MS database search score function. Uses teacher forcing to 'query' Casanovo for its score for each AA within a candidate peptide, and takes the geometric average of these scores @@ -1008,7 +1009,6 @@ class DbSpec2Pep(Spec2Pep): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.total_psms = 0 self.psm_batch_size = 1024 def predict_step(self, batch, *args): @@ -1029,11 +1029,14 @@ def predict_step(self, batch, *args): scores, amino acid-level scores, and associated proteins. 
""" predictions = [] - while len(batch[0]) > 0: - next_batch = [b[self.psm_batch_size :] for b in batch] - batch = [b[: self.psm_batch_size] for b in batch] + for start_idx in range(0, len(batch[0]), self.psm_batch_size): + current_batch = [ + b[start_idx : start_idx + self.psm_batch_size] for b in batch + ] pred, truth = self.decoder( - batch[3], batch[1], *self.encoder(batch[0]) + current_batch[3], + current_batch[1], + *self.encoder(current_batch[0]), ) pred = self.softmax(pred) all_scores, per_aa_scores = _calc_match_score( @@ -1048,13 +1051,13 @@ def predict_step(self, batch, *args): peptide, protein, ) in zip( - batch[1][:, 1].cpu().detach().numpy(), - batch[1][:, 2].cpu().detach().numpy(), - batch[2], + current_batch[1][:, 1].cpu().detach().numpy(), + current_batch[1][:, 2].cpu().detach().numpy(), + current_batch[2], all_scores.cpu().detach().numpy(), per_aa_scores.cpu().detach().numpy(), - batch[3], - batch[4], + current_batch[3], + current_batch[4], ): predictions.append( ( @@ -1067,8 +1070,6 @@ def predict_step(self, batch, *args): protein, ) ) - batch = next_batch - self.total_psms += len(predictions) return predictions def on_predict_batch_end( @@ -1088,8 +1089,6 @@ def on_predict_batch_end( aa_scores, protein, ) in outputs: - if len(peptide) == 0: - continue self.out_writer.psms.append( ( peptide, diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index a6b59ed9..c2b71098 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -10,8 +10,6 @@ from pathlib import Path from typing import Iterable, List, Optional, Union -import time - import lightning.pytorch as pl import numpy as np import torch @@ -20,7 +18,7 @@ from lightning.pytorch.callbacks import ModelCheckpoint from ..config import Config -from ..data import ms_io, db_utils +from ..data import db_utils, ms_io from ..denovo.dataloaders import DeNovoDataModule from ..denovo.model import Spec2Pep, DbSpec2Pep @@ -89,8 +87,8 @@ def db_search( digestion: str, missed_cleavages: int, max_mods: int, - min_length: int, - max_length: int, + min_peptide_length: int, + max_peptide_length: int, precursor_tolerance: float, isotope_error: str, output: str, @@ -100,7 +98,7 @@ def db_search( Parameters ---------- peak_path : Iterable[str] - The path to the .mgf data file for database search. + The paths to the .mgf data files for database search. fasta_path : str The path to the FASTA file for database search. enzyme : str @@ -111,9 +109,9 @@ def db_search( The number of missed cleavages allowed. max_mods : int The maximum number of modifications allowed per peptide. - min_length : int + min_peptide_length : int The minimum peptide length. - max_length : int + max_peptide_length : int The maximum peptide length. precursor_tolerance : float The precursor mass tolerance in ppm. 
@@ -147,19 +145,13 @@ def db_search( digestion, missed_cleavages, max_mods, - min_length, - max_length, + min_peptide_length, + max_peptide_length, ) self.loaders.precursor_tolerance = precursor_tolerance self.loaders.isotope_error = isotope_error - t1 = time.time() self.trainer.predict(self.model, self.loaders.db_dataloader()) - t2 = time.time() - logger.info("Database search took %.3f seconds", t2 - t1) - logger.info("Scored %s PSMs", self.model.total_psms) - logger.info("%.3f PSMs per second", self.model.total_psms / (t2 - t1)) - logger.info("%s seconds per PSM", (t2 - t1) / self.model.total_psms) def train( self, diff --git a/tests/conftest.py b/tests/conftest.py index b2244308..60afcd83 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,19 +17,16 @@ def mgf_small(tmp_path): @pytest.fixture -def tiny_fasta_file(tmp_path, fasta_raw_data): +def tiny_fasta_file(tmp_path): fasta_file = tmp_path / "tiny_fasta.fasta" with fasta_file.open("w+") as fasta_ref: - fasta_ref.write(fasta_raw_data) + fasta_ref.write( + ">foo\nMEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGERVTLSCRASQSVSSSYLTWYQQKPGQAPRLLIYGASTRATSIPARFSGSGSGTDFTLTISSLQPEDFAVYYCQQDYNLP" + ) return fasta_file -@pytest.fixture -def fasta_raw_data(): - return ">foo\nMEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGERVTLSCRASQSVSSSYLTWYQQKPGQAPRLLIYGASTRATSIPARFSGSGSGTDFTLTISSLQPEDFAVYYCQQDYNLP" - - @pytest.fixture def mgf_db_search(tmp_path): """An MGF file with 7 spectra and scan numbers, C+57.021 mass modification considered""" diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index e3707917..419cf3ef 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -2,6 +2,7 @@ import heapq import os import platform +import re import shutil import tempfile @@ -10,11 +11,10 @@ import numpy as np import pytest import torch -import re from casanovo import casanovo from casanovo import utils -from casanovo.data import ms_io, db_utils +from casanovo.data import db_utils, ms_io from casanovo.data.datasets import SpectrumDataset, AnnotatedSpectrumDataset from casanovo.denovo.evaluate import aa_match_batch, aa_match_metrics from casanovo.denovo.model import Spec2Pep, _aa_pep_score, _calc_match_score @@ -220,10 +220,7 @@ def test_calc_match_score(): assert np.sum(masked_per_aa_scores.numpy()[3]) == 3 -def test_digest_fasta_cleave(fasta_raw_data): - - with open("temp_fasta", "w") as file: - file.write(fasta_raw_data) +def test_digest_fasta_cleave(tiny_fasta_file): # No missed cleavages expected_normal = [ @@ -275,49 +272,24 @@ def test_digest_fasta_cleave(fasta_raw_data): "EIVMTQSPPTLSLSPGERVTLSC+57.021RASQSVSSSYLTWYQQKPGQAPR", "LLIYGASTRATSIPARFSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", ] + for missed_cleavages, expected in zip( + (0, 1, 3), + (expected_normal, expected_1missedcleavage, expected_3missedcleavage), + ): + peptide_list = db_utils.digest_fasta( + fasta_filename=str(tiny_fasta_file), + enzyme="trypsin", + digestion="full", + missed_cleavages=missed_cleavages, + max_mods=0, + min_peptide_length=6, + max_peptide_length=50, + ) + peptide_list = [x[0] for x in peptide_list] + assert peptide_list == expected - peptide_list = db_utils.digest_fasta( - fasta_filename="temp_fasta", - enzyme="trypsin", - digestion="full", - missed_cleavages=0, - max_mods=0, - min_length=6, - max_length=50, - ) - peptide_list = [x[0] for x in peptide_list] - assert peptide_list == expected_normal - - peptide_list = db_utils.digest_fasta( - fasta_filename="temp_fasta", - enzyme="trypsin", - digestion="full", - missed_cleavages=1, - 
max_mods=0, - min_length=6, - max_length=50, - ) - peptide_list = [x[0] for x in peptide_list] - assert peptide_list == expected_1missedcleavage - - peptide_list = db_utils.digest_fasta( - fasta_filename="temp_fasta", - enzyme="trypsin", - digestion="full", - missed_cleavages=3, - max_mods=0, - min_length=6, - max_length=50, - ) - peptide_list = [x[0] for x in peptide_list] - assert peptide_list == expected_3missedcleavage - - -def test_digest_fasta_mods(fasta_raw_data): - - with open("temp_fasta", "w") as file: - file.write(fasta_raw_data) +def test_digest_fasta_mods(tiny_fasta_file): # 1 modification allowed # fixed: C+57.02146 # variable: 1M+15.994915,1N+0.984016,1Q+0.984016 @@ -373,13 +345,13 @@ def test_digest_fasta_mods(fasta_raw_data): ] peptide_list = db_utils.digest_fasta( - fasta_filename="temp_fasta", + fasta_filename=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=0, max_mods=1, - min_length=6, - max_length=50, + min_peptide_length=6, + max_peptide_length=50, ) peptide_list = [x[0] for x in peptide_list] peptide_list = [ @@ -392,11 +364,7 @@ def test_digest_fasta_mods(fasta_raw_data): assert peptide_list == expected_1mod -def test_length_restrictions(fasta_raw_data): - - with open("temp_fasta", "w") as file: - file.write(fasta_raw_data) - +def test_length_restrictions(tiny_fasta_file): # length between 20 and 50 expected_long = [ "MEAPAQLLFLLLLWLPDTTR", @@ -408,35 +376,31 @@ def test_length_restrictions(fasta_raw_data): expected_short = ["ATSIPAR", "VTLSC+57.021R"] peptide_list = db_utils.digest_fasta( - fasta_filename="temp_fasta", + fasta_filename=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=0, max_mods=0, - min_length=20, - max_length=50, + min_peptide_length=20, + max_peptide_length=50, ) peptide_list = [x[0] for x in peptide_list] assert peptide_list == expected_long peptide_list = db_utils.digest_fasta( - fasta_filename="temp_fasta", + fasta_filename=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=0, max_mods=0, - min_length=6, - max_length=8, + min_peptide_length=6, + max_peptide_length=8, ) peptide_list = [x[0] for x in peptide_list] assert peptide_list == expected_short -def test_digest_fasta_enzyme(fasta_raw_data): - - with open("temp_fasta", "w") as file: - file.write(fasta_raw_data) - +def test_digest_fasta_enzyme(tiny_fasta_file): # arg-c enzyme expected_argc = [ "ATSIPAR", @@ -452,35 +416,31 @@ def test_digest_fasta_enzyme(fasta_raw_data): expected_aspn = ["DFAVYYC+57.021QQ", "DFTLTISSLQPE", "MEAPAQLLFLLLLWLP"] peptide_list = db_utils.digest_fasta( - fasta_filename="temp_fasta", + fasta_filename=str(tiny_fasta_file), enzyme="arg-c", digestion="full", missed_cleavages=0, max_mods=0, - min_length=6, - max_length=50, + min_peptide_length=6, + max_peptide_length=50, ) peptide_list = [x[0] for x in peptide_list] assert peptide_list == expected_argc peptide_list = db_utils.digest_fasta( - fasta_filename="temp_fasta", + fasta_filename=str(tiny_fasta_file), enzyme="asp-n", digestion="full", missed_cleavages=0, max_mods=0, - min_length=6, - max_length=50, + min_peptide_length=6, + max_peptide_length=50, ) peptide_list = [x[0] for x in peptide_list] assert peptide_list == expected_aspn -def test_get_candidates(fasta_raw_data): - - with open("temp_fasta", "w") as file: - file.write(fasta_raw_data) - +def test_get_candidates(tiny_fasta_file): # precursor_window is 10000 expected_smallwindow = ["LLIYGASTR"] @@ -491,13 +451,13 @@ def test_get_candidates(fasta_raw_data): expected_widewindow = 
["ATSIPAR", "VTLSC+57.021R", "LLIYGASTR"] peptide_list = db_utils.digest_fasta( - fasta_filename="temp_fasta", + fasta_filename=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=1, max_mods=0, - min_length=6, - max_length=50, + min_peptide_length=6, + max_peptide_length=50, ) candidates = db_utils.get_candidates( @@ -511,13 +471,13 @@ def test_get_candidates(fasta_raw_data): assert expected_smallwindow == candidates peptide_list = db_utils.digest_fasta( - fasta_filename="temp_fasta", + fasta_filename=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=1, max_mods=0, - min_length=6, - max_length=50, + min_peptide_length=6, + max_peptide_length=50, ) candidates = db_utils.get_candidates( @@ -531,13 +491,13 @@ def test_get_candidates(fasta_raw_data): assert expected_midwindow == candidates peptide_list = db_utils.digest_fasta( - fasta_filename="temp_fasta", + fasta_filename=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=1, max_mods=0, - min_length=6, - max_length=50, + min_peptide_length=6, + max_peptide_length=50, ) candidates = db_utils.get_candidates( From b2f08ac307f50c4dabc458745cd79b3ec2058f35 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Mon, 19 Aug 2024 19:09:26 -0700 Subject: [PATCH 29/84] add proteindatabase --- casanovo/casanovo.py | 110 -------- casanovo/config.yaml | 36 ++- casanovo/data/datasets.py | 2 +- casanovo/data/db_utils.py | 442 +++++++++++++++++--------------- casanovo/denovo/dataloaders.py | 28 +- casanovo/denovo/model_runner.py | 45 +--- tests/conftest.py | 5 + tests/test_integration.py | 2 - tests/unit_tests/test_unit.py | 200 +++++++++------ 9 files changed, 404 insertions(+), 466 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 4b9b4e38..b153512d 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -158,111 +158,9 @@ def sequence( nargs=1, type=click.Path(exists=True, dir_okay=False), ) -@click.option( - "--enzyme", - help="Enzyme for in silico digestion, \ - See pyteomics.parser.expasy_rules for valid enzymes", - type=click.Choice( - [ - "arg-c", - "asp-n", - "bnps-skatole", - "caspase 1", - "caspase 2", - "caspase 3", - "caspase 4", - "caspase 5", - "caspase 6", - "caspase 7", - "caspase 8", - "caspase 9", - "caspase 10", - "chymotrypsin high specificity", - "chymotrypsin low specificity", - "clostripain", - "cnbr", - "enterokinase", - "factor xa", - "formic acid", - "glutamyl endopeptidase", - "granzyme b", - "hydroxylamine", - "iodosobenzoic acid", - "lysc", - "ntcb", - "pepsin ph1.3", - "pepsin ph2.0", - "proline endopeptidase", - "proteinase k", - "staphylococcal peptidase i", - "thermolysin", - "thrombin", - "trypsin", - "trypsin_exception", - ] - ), - default="trypsin", -) -@click.option( - "--digestion", - help="Full: standard digestion. 
Semi: Include products of semi-specific cleavage", - type=click.Choice( - ["full", "partial"], - case_sensitive=False, - ), - default="full", -) -@click.option( - "--missed_cleavages", - help="Number of allowed missed cleavages when digesting protein", - type=int, - default=0, -) -@click.option( - "--max_mods", - help="Maximum number of amino acid modifications per peptide", - type=int, - default=0, -) -@click.option( - "--min_peptide_length", - help="Minimum peptide length to consider", - type=int, - default=6, -) -@click.option( - "--max_peptide_length", - help="Maximum peptide length to consider", - type=int, - default=50, -) -@click.option( - "--precursor_tolerance", - help="Precursor tolerance window size (units: ppm)", - type=float, - default=20, -) -@click.option( - "--isotope_error", - help="Isotope error levels to consider. \ - Creates multiple mass windows to consider per spectrum \ - to account for observed mass not matching monoisotopic mass \ - due to the instrument assigning the 13C isotope \ - peak as the precursor (list of ints, e.g: 1,2)", - type=str, - default="0", -) def db_search( peak_path: Tuple[str], fasta_path: str, - enzyme: str, - digestion: str, - missed_cleavages: int, - max_mods: int, - min_peptide_length: int, - max_peptide_length: int, - precursor_tolerance: float, - isotope_error: str, model: Optional[str], config: Optional[str], output: Optional[str], @@ -285,14 +183,6 @@ def db_search( runner.db_search( peak_path, fasta_path, - enzyme, - digestion, - missed_cleavages, - max_mods, - min_peptide_length, - max_peptide_length, - precursor_tolerance, - isotope_error, output, ) diff --git a/casanovo/config.yaml b/casanovo/config.yaml index c7186ff7..860cfabb 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -5,18 +5,26 @@ ### # The following parameters can be modified when running inference or when -# fine-tuning an existing Casanovo model. +# fine-tuning an existing Casanovo model. They also affect database search +# parameters when running Casanovo in DB-search mode. ### # Max absolute difference allowed with respect to observed precursor m/z. -# Predictions outside the tolerance range are assigned a negative peptide score. +# denovo: Predictions outside the tolerance range are assigned a negative peptide score. +# db-search: Used to create mas windows for candidate generation. precursor_mass_tol: 50 # ppm # Isotopes to consider when comparing predicted and observed precursor m/z's. isotope_error_range: [0, 1] -# The minimum length of predicted peptides. +# The minimum length of predicted/scored peptides. min_peptide_len: 6 -# Number of spectra in one inference batch. +# Number of spectra or psms in one inference batch. predict_batch_size: 1024 + + +### +# The following parameters are unique to Casanovo's inference/finetuning mode. +### + # Number of beams used in beam search. n_beams: 1 # Number of PSMs for each spectrum. @@ -29,6 +37,26 @@ accelerator: "auto" # number will be automatically selected for based on the chosen accelerator. devices: + +### +# The following parameters are unique to Casanovo's database search mode. +### + +# Enzyme for in silico digestion, used to generate candidate peptides. +# See pyteomics.parser.expasy_rules for valid enzymes +enzyme: "trypsin" +# Digestion type for candidate peptide generation. +# Full: standard digestion. 
Semi: Include products of semi-specific cleavage +digestion: "full" +# Number of allowed missed cleavages when digesting protein +missed_cleavages: 0 +# Maximum number of amino acid modifications per peptide. +# None generates all possible isoforms as candidates. +max_mods: +# Maximum peptide length to consider +max_peptide_len: 50 + + ### # The following parameters should only be modified if you are training a new # Casanovo model from scratch. diff --git a/casanovo/data/datasets.py b/casanovo/data/datasets.py index 6244e88f..3f05811f 100644 --- a/casanovo/data/datasets.py +++ b/casanovo/data/datasets.py @@ -1,6 +1,6 @@ """A PyTorch Dataset class for annotated spectra.""" -from typing import Optional, Tuple +from typing import List, Optional, Tuple import depthcharge import numpy as np diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index 1af09a47..a7b5e850 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -6,15 +6,12 @@ from typing import List, Tuple import depthcharge.masses +from numba import jit from pyteomics import fasta, parser logger = logging.getLogger("casanovo") - # CONSTANTS -HYDROGEN = 1.007825035 -OXYGEN = 15.99491463 -H2O = 2 * HYDROGEN + OXYGEN PROTON = 1.00727646677 ISOTOPE_SPACING = 1.003355 @@ -29,216 +26,243 @@ fixed_mods = {"carbm": ["C"]} -def convert_from_modx(seq: str): - """Converts peptide sequence from modX format to Casanovo-acceptable modifications. - - Args: - seq (str): Peptide in modX format - """ - seq = seq.replace("carbmC", "C+57.021") # Fixed modification - seq = seq.replace("oxM", "M+15.995") - seq = seq.replace("dN", "N+0.984") - seq = seq.replace("dQ", "Q+0.984") - seq = seq.replace("ace-", "+42.011") - seq = seq.replace("carbnh3x-", "+43.006-17.027") - seq = seq.replace("carb-", "+43.006") - seq = seq.replace("nh3x-", "-17.027") - return seq - - -def digest_fasta( - fasta_filename: str, - enzyme: str, - digestion: str, - missed_cleavages: int, - max_mods: int, - min_peptide_length: int, - max_peptide_length: int, -): - """ - Digests a FASTA file and returns the peptides, their masses, and associated protein. - - Parameters - ---------- - fasta_filename : str - Path to the FASTA file. - enzyme : str - The enzyme to use for digestion. - See pyteomics.parser.expasy_rules for valid enzymes. - digestion : str - The type of digestion to perform. Either 'full' or 'partial'. - missed_cleavages : int - The number of missed cleavages to allow. - max_mods : int - The maximum number of modifications to allow per peptide. - min_peptide_length : int - The minimum length of peptides to consider. - max_peptide_length : int - The maximum length of peptides to consider. - - Returns - ------- - mod_peptide_list : List[Tuple[str, float, str]] - A list of tuples containing the peptide sequence, mass, - and associated protein. Sorted by neutral mass in ascending order. 
- """ - # Verify the existence of the file: - if not os.path.isfile(fasta_filename): - logger.error("File %s does not exist.", fasta_filename) - raise FileNotFoundError(f"File {fasta_filename} does not exist.") - - fasta_data = fasta.read(fasta_filename) - peptide_list = [] - if digestion not in ["full", "partial"]: - logger.error("Digestion type %s not recognized.", digestion) - raise ValueError(f"Digestion type {digestion} not recognized.") - semi = digestion == "partial" - for header, seq in fasta_data: - pep_set = parser.cleave( - seq, - rule=parser.expasy_rules[enzyme], - missed_cleavages=missed_cleavages, - semi=semi, - ) - protein = header.split()[0] - for pep in pep_set: - if len(pep) < min_peptide_length or len(pep) > max_peptide_length: - continue - if any( - aa in pep for aa in "BJOUXZ" - ): # Check for incorrect AA letters - logger.warn( - "Skipping peptide with ambiguous amino acids: %s", pep - ) - continue - peptide_list.append((pep, protein)) - - # Generate modified peptides - mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") - mod_peptide_list = [] - for pep, prot in peptide_list: - peptide_isoforms = parser.isoforms( - pep, - variable_mods=var_mods, - fixed_mods=fixed_mods, - max_mods=max_mods, - ) - peptide_isoforms = list(map(convert_from_modx, peptide_isoforms)) - mod_peptide_list.extend( - (mod_pep, mass_calculator.mass(mod_pep), prot) - for mod_pep in peptide_isoforms - ) - - # Sort the peptides by mass and return. - mod_peptide_list.sort(key=lambda x: x[1]) - return mod_peptide_list - - -def get_candidates( - precursor_mz: float, - charge: int, - peptide_list: List[Tuple[str, float, str]], - precursor_tolerance: float, - isotope_error: str, -): +class ProteinDatabase: """ - Returns a list of candidate peptides that fall within the specified mass range. + TODO Parameters ---------- - precursor_mz : float - The precursor mass-to-charge ratio. - charge : int - The precursor charge. - peptide_list : List[Tuple[str, float, str]] - A list of tuples containing the peptide sequence, mass, and associated protein. - Must be sorted by mass in ascending order. Uses neutral masses. - precursor_tolerance : float - The precursor mass tolerance in parts-per-million. - isotope_error : str - The isotope error levels to consider. + TODO """ - candidates = set() - isotope_error = [int(x) for x in isotope_error.split(",")] - for e in isotope_error: - iso_shift = ISOTOPE_SPACING * e - upper_bound = (_to_raw_mass(precursor_mz, charge) - iso_shift) * ( - 1 + (precursor_tolerance / 1e6) - ) - lower_bound = (_to_raw_mass(precursor_mz, charge) - iso_shift) * ( - 1 - (precursor_tolerance / 1e6) + def __init__( + self, + fasta_path: str, + enzyme: str, + digestion: str, + missed_cleavages: int, + min_peptide_len: int, + max_peptide_len: int, + max_mods: int, + precursor_tolerance: float, + isotope_error: List[int], + ): + self.digest = self._digest_fasta( + fasta_path, + enzyme, + digestion, + missed_cleavages, + max_mods, + min_peptide_len, + max_peptide_len, ) - - start, end = get_mass_indices( - [x[1] for x in peptide_list], lower_bound, upper_bound + self.precursor_tolerance = precursor_tolerance + self.isotope_error = isotope_error + + def get_candidates( + self, + precursor_mz: float, + charge: int, + ): + """ + Returns a list of candidate peptides that fall within the specified mass range. + + Parameters + ---------- + precursor_mz : float + The precursor mass-to-charge ratio. + charge : int + The precursor charge. 
+ """ + candidates = set() + + for e in self.isotope_error: + iso_shift = ISOTOPE_SPACING * e + upper_bound = ( + ProteinDatabase._to_raw_mass(precursor_mz, charge) - iso_shift + ) * (1 + (self.precursor_tolerance / 1e6)) + lower_bound = ( + ProteinDatabase._to_raw_mass(precursor_mz, charge) - iso_shift + ) * (1 - (self.precursor_tolerance / 1e6)) + + start, end = ProteinDatabase._get_mass_indices( + [x[1] for x in self.digest], lower_bound, upper_bound + ) + + candidates.update(self.digest[start:end]) + + candidates = list(candidates) + candidates.sort(key=lambda x: x[1]) + return candidates + + def _digest_fasta( + self, + fasta_filename: str, + enzyme: str, + digestion: str, + missed_cleavages: int, + max_mods: int, + min_peptide_length: int, + max_peptide_length: int, + ): + """ + Digests a FASTA file and returns the peptides, their masses, and associated protein. + + Parameters + ---------- + fasta_filename : str + Path to the FASTA file. + enzyme : str + The enzyme to use for digestion. + See pyteomics.parser.expasy_rules for valid enzymes. + digestion : str + The type of digestion to perform. Either 'full' or 'partial'. + missed_cleavages : int + The number of missed cleavages to allow. + max_mods : int + The maximum number of modifications to allow per peptide. + min_peptide_length : int + The minimum length of peptides to consider. + max_peptide_length : int + The maximum length of peptides to consider. + + Returns + ------- + mod_peptide_list : List[Tuple[str, float, str]] + A list of tuples containing the peptide sequence, mass, + and associated protein. Sorted by neutral mass in ascending order. + """ + # Verify the existence of the file: + if not os.path.isfile(fasta_filename): + logger.error("File %s does not exist.", fasta_filename) + raise FileNotFoundError(f"File {fasta_filename} does not exist.") + + fasta_data = fasta.read(fasta_filename) + peptide_list = [] + if digestion not in ["full", "partial"]: + logger.error("Digestion type %s not recognized.", digestion) + raise ValueError(f"Digestion type {digestion} not recognized.") + semi = digestion == "partial" + for header, seq in fasta_data: + pep_set = parser.cleave( + seq, + rule=parser.expasy_rules[enzyme], + missed_cleavages=missed_cleavages, + semi=semi, + ) + protein = header.split()[0] + for pep in pep_set: + if ( + len(pep) < min_peptide_length + or len(pep) > max_peptide_length + ): + continue + if any( + aa in pep for aa in "BJOUXZ" + ): # Check for incorrect AA letters + logger.warn( + "Skipping peptide with ambiguous amino acids: %s", pep + ) + continue + peptide_list.append((pep, protein)) + + # Generate modified peptides + mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") + mod_peptide_list = [] + for pep, prot in peptide_list: + peptide_isoforms = parser.isoforms( + pep, + variable_mods=var_mods, + fixed_mods=fixed_mods, + max_mods=max_mods, + ) + peptide_isoforms = list( + map(ProteinDatabase._convert_from_modx, peptide_isoforms) + ) + mod_peptide_list.extend( + (mod_pep, mass_calculator.mass(mod_pep), prot) + for mod_pep in peptide_isoforms + ) + + # Sort the peptides by mass and return. + mod_peptide_list.sort(key=lambda x: x[1]) + logger.info( + "Digestion complete. %d peptides generated.", len(mod_peptide_list) ) - - candidates.update(peptide_list[start:end]) - - candidates = list(candidates) - candidates.sort(key=lambda x: x[1]) - return candidates - - -def _to_mz(precursor_mass, charge): - """ - Convert precursor neutral mass to m/z value. 
- - Parameters - ---------- - precursor_mass : float - The precursor neutral mass. - charge : int - The precursor charge. - - Returns - ------- - mz : float - The calculated precursor mass-to-charge ratio. - """ - return (precursor_mass + (charge * PROTON)) / charge - - -def _to_raw_mass(mz_mass, charge): - """ - Convert precursor m/z value to neutral mass. - - Parameters - ---------- - mz_mass : float - The precursor mass-to-charge ratio. - charge : int - The precursor charge. - - Returns - ------- - mass : float - The calculated precursor neutral mass. - """ - return charge * (mz_mass - PROTON) - - -def get_mass_indices(masses, m_low, m_high): - """Grabs mass indices that fall within a specified range. - - Pulls from masses, a list of mass values. - Requires that the mass values are sorted in ascending order. - - Parameters - ---------- - masses : List[int] - List of mass values - m_low : int - Lower bound of mass range (inclusive) - m_high : int - Upper bound of mass range (inclusive) - - Return - ------ - indices : Tuple[int, int] - Indices of mass values that fall within the specified range - """ - start = bisect.bisect_left(masses, m_low) - end = bisect.bisect_right(masses, m_high) - return start, end + return mod_peptide_list + + def _to_mz(precursor_mass, charge): + """ + Convert precursor neutral mass to m/z value. + + Parameters + ---------- + precursor_mass : float + The precursor neutral mass. + charge : int + The precursor charge. + + Returns + ------- + mz : float + The calculated precursor mass-to-charge ratio. + """ + return (precursor_mass + (charge * PROTON)) / charge + + def _to_raw_mass(mz_mass, charge): + """ + Convert precursor m/z value to neutral mass. + + Parameters + ---------- + mz_mass : float + The precursor mass-to-charge ratio. + charge : int + The precursor charge. + + Returns + ------- + mass : float + The calculated precursor neutral mass. + """ + return charge * (mz_mass - PROTON) + + def _get_mass_indices(masses, m_low, m_high): + """Grabs mass indices that fall within a specified range. + + Pulls from masses, a list of mass values. + Requires that the mass values are sorted in ascending order. + + Parameters + ---------- + masses : List[int] + List of mass values + m_low : int + Lower bound of mass range (inclusive) + m_high : int + Upper bound of mass range (inclusive) + + Return + ------ + indices : Tuple[int, int] + Indices of mass values that fall within the specified range + """ + start = bisect.bisect_left(masses, m_low) + end = bisect.bisect_right(masses, m_high) + return start, end + + def _convert_from_modx(seq: str): + """Converts peptide sequence from modX format to Casanovo-acceptable modifications. 
+ + Args: + seq (str): Peptide in modX format + """ + seq = seq.replace("carbmC", "C+57.021") # Fixed modification + seq = seq.replace("oxM", "M+15.995") + seq = seq.replace("dN", "N+0.984") + seq = seq.replace("dQ", "Q+0.984") + seq = seq.replace("ace-", "+42.011") + seq = seq.replace("carbnh3x-", "+43.006-17.027") + seq = seq.replace("carb-", "+43.006") + seq = seq.replace("nh3x-", "-17.027") + return seq diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 14a0ff99..4d5524f4 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -89,6 +89,7 @@ def __init__( self.train_dataset = None self.valid_dataset = None self.test_dataset = None + self.pdb = None def setup(self, stage: str = None, annotated: bool = True) -> None: """ @@ -96,7 +97,7 @@ def setup(self, stage: str = None, annotated: bool = True) -> None: Parameters ---------- - stage : str {"fit", "validate", "test", "db"} + stage : str {"fit", "validate", "test"} The stage indicating which Datasets to prepare. All are prepared by default. annotated: bool @@ -186,12 +187,7 @@ def db_dataloader(self) -> torch.utils.data.DataLoader: return torch.utils.data.DataLoader( self.test_dataset, batch_size=self.eval_batch_size, - collate_fn=functools.partial( - prepare_psm_batch, - digest=self.digest, - precursor_tolerance=self.precursor_tolerance, - isotope_error=self.isotope_error, - ), + collate_fn=functools.partial(prepare_psm_batch, pdb=self.pdb), pin_memory=True, num_workers=self.n_workers, shuffle=False, @@ -239,9 +235,7 @@ def prepare_batch( def prepare_psm_batch( batch: List[Tuple[torch.Tensor, float, int, str]], - digest: List[Tuple[str, float, str]], - precursor_tolerance: float, - isotope_error: str, + pdb: db_utils.ProteinDatabase, ): """ Collate MS/MS spectra into a batch for DB search. @@ -255,13 +249,8 @@ def prepare_psm_batch( A batch of data from an AnnotatedSpectrumDataset, consisting of for each spectrum (i) a tensor with the m/z and intensity peak values, (ii), the precursor m/z, (iii) the precursor charge, (iv) the spectrum identifier. - digest : List[Tuple[str, float, str]] - A list of tuples containing the peptide sequence, mass, and associated protein - from digesting a .fasta file. Sorted by mass in ascending order. Uses neutral masses. - precursor_tolerance : float - The precursor mass tolerance in parts-per-million. - isotope_error : str - The isotope error levels to consider. + pdb : db_utils.ProteinDatabase + The protein database to use for candidate peptide retrieval. Returns ------- @@ -294,12 +283,9 @@ def prepare_psm_batch( all_peptides = [] all_proteins = [] for idx in range(len(batch)): - digest_data = db_utils.get_candidates( + digest_data = pdb.get_candidates( precursor_mzs[idx], precursor_charges[idx], - digest, - precursor_tolerance, - isotope_error, ) try: spec_peptides, _, pep_protein = list(zip(*digest_data)) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index c2b71098..b90f06b0 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -83,14 +83,6 @@ def db_search( self, peak_path: Iterable[str], fasta_path: str, - enzyme: str, - digestion: str, - missed_cleavages: int, - max_mods: int, - min_peptide_length: int, - max_peptide_length: int, - precursor_tolerance: float, - isotope_error: str, output: str, ) -> None: """Perform database search with Casanovo. @@ -101,22 +93,6 @@ def db_search( The paths to the .mgf data files for database search. 
fasta_path : str The path to the FASTA file for database search. - enzyme : str - The enzyme used for digestion. - digestion : str - The digestion type, full or partial. - missed_cleavages : int - The number of missed cleavages allowed. - max_mods : int - The maximum number of modifications allowed per peptide. - min_peptide_length : int - The minimum peptide length. - max_peptide_length : int - The maximum peptide length. - precursor_tolerance : float - The precursor mass tolerance in ppm. - isotope_error : str - Isotope error levels to consider, in comma-delineated string form. output : str Where should the output be saved? @@ -138,19 +114,18 @@ def db_search( self.writer.set_ms_run(test_index.ms_files) self.initialize_data_module(test_index=test_index) - self.loaders.setup(stage="test", annotated=False) - self.loaders.digest = db_utils.digest_fasta( + self.loaders.pdb = db_utils.ProteinDatabase( fasta_path, - enzyme, - digestion, - missed_cleavages, - max_mods, - min_peptide_length, - max_peptide_length, + self.config.enzyme, + self.config.digestion, + self.config.missed_cleavages, + self.config.min_peptide_len, + self.config.max_peptide_len, + self.config.max_mods, + self.config.precursor_mass_tol, + self.config.isotope_error_range, ) - self.loaders.precursor_tolerance = precursor_tolerance - self.loaders.isotope_error = isotope_error - + self.loaders.setup(stage="test", annotated=False) self.trainer.predict(self.model, self.loaders.db_dataloader()) def train( diff --git a/tests/conftest.py b/tests/conftest.py index 60afcd83..f20d7879 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -242,6 +242,11 @@ def tiny_config(tmp_path): "precursor_mass_tol": 5, "isotope_error_range": [0, 1], "min_peptide_len": 6, + "max_peptide_len": 50, + "enzyme": "trypsin", + "digestion": "full", + "missed_cleavages": 0, + "max_mods": None, "predict_batch_size": 1024, "n_beams": 1, "top_match": 1, diff --git a/tests/test_integration.py b/tests/test_integration.py index 4bd55174..61f735c3 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -24,8 +24,6 @@ def test_db_search( tiny_config, "--output", str(output_path), - "--precursor_tolerance", - str(100), str(mgf_db_search), str(tiny_fasta_file), ] diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 419cf3ef..7a37e771 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -276,15 +276,18 @@ def test_digest_fasta_cleave(tiny_fasta_file): (0, 1, 3), (expected_normal, expected_1missedcleavage, expected_3missedcleavage), ): - peptide_list = db_utils.digest_fasta( - fasta_filename=str(tiny_fasta_file), + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=missed_cleavages, + min_peptide_len=6, + max_peptide_len=50, max_mods=0, - min_peptide_length=6, - max_peptide_length=50, + precursor_tolerance=20, + isotope_error=[0], ) + peptide_list = pdb.digest peptide_list = [x[0] for x in peptide_list] assert peptide_list == expected @@ -343,16 +346,18 @@ def test_digest_fasta_mods(tiny_fasta_file): "+42.011FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", "+43.006FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", ] - - peptide_list = db_utils.digest_fasta( - fasta_filename=str(tiny_fasta_file), + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=0, + min_peptide_len=6, + max_peptide_len=50, max_mods=1, - min_peptide_length=6, - max_peptide_length=50, + 
precursor_tolerance=20, + isotope_error=[0], ) + peptide_list = pdb.digest peptide_list = [x[0] for x in peptide_list] peptide_list = [ x @@ -375,27 +380,33 @@ def test_length_restrictions(tiny_fasta_file): # length between 6 and 8 expected_short = ["ATSIPAR", "VTLSC+57.021R"] - peptide_list = db_utils.digest_fasta( - fasta_filename=str(tiny_fasta_file), + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=0, + min_peptide_len=20, + max_peptide_len=50, max_mods=0, - min_peptide_length=20, - max_peptide_length=50, + precursor_tolerance=20, + isotope_error=[0], ) + peptide_list = pdb.digest peptide_list = [x[0] for x in peptide_list] assert peptide_list == expected_long - peptide_list = db_utils.digest_fasta( - fasta_filename=str(tiny_fasta_file), + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=0, + min_peptide_len=6, + max_peptide_len=8, max_mods=0, - min_peptide_length=6, - max_peptide_length=8, + precursor_tolerance=20, + isotope_error=[0], ) + peptide_list = pdb.digest peptide_list = [x[0] for x in peptide_list] assert peptide_list == expected_short @@ -415,27 +426,33 @@ def test_digest_fasta_enzyme(tiny_fasta_file): # asp-n enzyme expected_aspn = ["DFAVYYC+57.021QQ", "DFTLTISSLQPE", "MEAPAQLLFLLLLWLP"] - peptide_list = db_utils.digest_fasta( - fasta_filename=str(tiny_fasta_file), + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), enzyme="arg-c", digestion="full", missed_cleavages=0, + min_peptide_len=6, + max_peptide_len=50, max_mods=0, - min_peptide_length=6, - max_peptide_length=50, + precursor_tolerance=20, + isotope_error=[0], ) + peptide_list = pdb.digest peptide_list = [x[0] for x in peptide_list] assert peptide_list == expected_argc - peptide_list = db_utils.digest_fasta( - fasta_filename=str(tiny_fasta_file), + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), enzyme="asp-n", digestion="full", missed_cleavages=0, + min_peptide_len=6, + max_peptide_len=50, max_mods=0, - min_peptide_length=6, - max_peptide_length=50, + precursor_tolerance=20, + isotope_error=[0], ) + peptide_list = pdb.digest peptide_list = [x[0] for x in peptide_list] assert peptide_list == expected_aspn @@ -450,68 +467,53 @@ def test_get_candidates(tiny_fasta_file): # precursor window is 600000 expected_widewindow = ["ATSIPAR", "VTLSC+57.021R", "LLIYGASTR"] - peptide_list = db_utils.digest_fasta( - fasta_filename=str(tiny_fasta_file), + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=1, + min_peptide_len=6, + max_peptide_len=50, max_mods=0, - min_peptide_length=6, - max_peptide_length=50, - ) - - candidates = db_utils.get_candidates( - precursor_mz=496.2, - charge=2, - peptide_list=peptide_list, precursor_tolerance=10000, - isotope_error="0", + isotope_error=[0], ) + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) candidates = [x[0] for x in candidates] assert expected_smallwindow == candidates - peptide_list = db_utils.digest_fasta( - fasta_filename=str(tiny_fasta_file), + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=1, + min_peptide_len=6, + max_peptide_len=50, max_mods=0, - min_peptide_length=6, - max_peptide_length=50, - ) - - candidates = db_utils.get_candidates( - precursor_mz=496.2, - charge=2, - peptide_list=peptide_list, precursor_tolerance=150000, - isotope_error="0", + 
isotope_error=[0], ) + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) candidates = [x[0] for x in candidates] assert expected_midwindow == candidates - peptide_list = db_utils.digest_fasta( - fasta_filename=str(tiny_fasta_file), + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=1, + min_peptide_len=6, + max_peptide_len=50, max_mods=0, - min_peptide_length=6, - max_peptide_length=50, - ) - - candidates = db_utils.get_candidates( - precursor_mz=496.2, - charge=2, - peptide_list=peptide_list, precursor_tolerance=600000, - isotope_error="0", + isotope_error=[0], ) + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) candidates = [x[0] for x in candidates] assert expected_widewindow == candidates -def test_get_candidates_isotope_error(): +def test_get_candidates_isotope_error(tiny_fasta_file): # Tide isotope error windows for 496.2, 2+: # 0: [980.481617, 1000.289326] @@ -556,53 +558,83 @@ def test_get_candidates_isotope_error(): expected_isotope3 = list("XWVUTSRQPONMLKJIHGFE") expected_isotope0123 = list("XWVUTSRQPONMLKJIHGFEDCB") - candidates = db_utils.get_candidates( - precursor_mz=496.2, - charge=2, - peptide_list=peptide_list, + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + min_peptide_len=0, + max_peptide_len=0, + max_mods=0, precursor_tolerance=10000, - isotope_error="0", + isotope_error=[0], ) + pdb.digest = peptide_list + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) candidates = [x[0] for x in candidates] assert expected_isotope0 == candidates - candidates = db_utils.get_candidates( - precursor_mz=496.2, - charge=2, - peptide_list=peptide_list, + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + min_peptide_len=0, + max_peptide_len=0, + max_mods=0, precursor_tolerance=10000, - isotope_error="1", + isotope_error=[1], ) + pdb.digest = peptide_list + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) candidates = [x[0] for x in candidates] assert expected_isotope1 == candidates - candidates = db_utils.get_candidates( - precursor_mz=496.2, - charge=2, - peptide_list=peptide_list, + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + min_peptide_len=0, + max_peptide_len=0, + max_mods=0, precursor_tolerance=10000, - isotope_error="2", + isotope_error=[2], ) + pdb.digest = peptide_list + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) candidates = [x[0] for x in candidates] assert expected_isotope2 == candidates - candidates = db_utils.get_candidates( - precursor_mz=496.2, - charge=2, - peptide_list=peptide_list, + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + min_peptide_len=0, + max_peptide_len=0, + max_mods=0, precursor_tolerance=10000, - isotope_error="3", + isotope_error=[3], ) + pdb.digest = peptide_list + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) candidates = [x[0] for x in candidates] assert expected_isotope3 == candidates - candidates = db_utils.get_candidates( - precursor_mz=496.2, - charge=2, - peptide_list=peptide_list, + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + min_peptide_len=0, + max_peptide_len=0, + 
max_mods=0, precursor_tolerance=10000, - isotope_error="0,1,2,3", + isotope_error=[0, 1, 2, 3], ) + pdb.digest = peptide_list + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) candidates = [x[0] for x in candidates] assert expected_isotope0123 == candidates From 3d0b0b9b6f3c4efedd7034aab4ecc62de2a9a4ca Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 20 Aug 2024 02:12:46 +0000 Subject: [PATCH 30/84] Generate new screengrabs with rich-codex --- docs/images/configure-help.svg | 160 +++++++--------------- docs/images/evaluate-help.svg | 191 +++++++++++++------------- docs/images/help.svg | 223 ++++++++++++++++++------------- docs/images/sequence-help.svg | 191 +++++++++++++------------- docs/images/train-help.svg | 237 ++++++++++++++++++++------------- 5 files changed, 509 insertions(+), 493 deletions(-) diff --git a/docs/images/configure-help.svg b/docs/images/configure-help.svg index 4092bce3..b1fcce10 100644 --- a/docs/images/configure-help.svg +++ b/docs/images/configure-help.svg @@ -1,4 +1,4 @@ - + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + - + - + - - $ casanovo configure --help -Traceback (most recent call last): -  File "/opt/hostedtoolcache/Python/3.10.14/x64/bin/casanovo", line 5, in <module> -    from casanovo.casanovo import main -  File "/opt/hostedtoolcache/Python/3.10.14/x64/lib/python3.10/site-packages/casanovo/casanovo.py", line 32, in <module> -    import depthcharge -  File "/opt/hostedtoolcache/Python/3.10.14/x64/lib/python3.10/site-packages/depthcharge/__init__.py", line 3, in <module> -    from . import components -  File "/opt/hostedtoolcache/Python/3.10.14/x64/lib/python3.10/site-packages/depthcharge/components/__init__.py", line 2, in <module> -    from .transformers import SpectrumEncoder, PeptideDecoder -  File "/opt/hostedtoolcache/Python/3.10.14/x64/lib/python3.10/site-packages/depthcharge/components/transformers.py", line 8, in <module> -    from .. 
[Screengrab diffs for docs/images/configure-help.svg, evaluate-help.svg, help.svg, sequence-help.svg, and train-help.svg: each of the previous rich-codex captures recorded the same startup failure, a traceback through the depthcharge and tensorboard imports ending in `AttributeError: np.string_ was removed in the NumPy 2.0 release. Use np.bytes_ instead.` The regenerated captures show the normal help output instead: `casanovo configure` (generate a YAML configuration file, `--output`/`-o`), `casanovo evaluate` and `casanovo sequence` (ANNOTATED_PEAK_PATH / PEAK_PATH arguments with `--model`, `--output`, `--config`, and `--verbosity` options), the top-level `casanovo --help` (listing the commands configure, db-search, evaluate, sequence, train, and version, with db-search described as "Perform a database search on MS/MS data using Casanovo-DB"), and `casanovo train --help`, whose argument and option panels follow below.]
+ +╭─ Arguments ──────────────────────────────────────────────────────────────────╮ +*  TRAIN_PEAK_PATH    FILE[required] +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────╮ +*--validation_peak_pa…-pFILE                    An annotated MGF file   +                                                       for validation, like    +                                                       from MassIVE-KB. Use    +                                                       this option multiple    +                                                       times to specify        +                                                       multiple files.         +[required]             +--model-mFILE                    The model weights       +                                                       (.ckpt file). If not    +                                                       provided, Casanovo      +                                                       will try to download    +                                                       the latest release.     +--output-oFILE                    The mzTab file to       +                                                       which results will be   +                                                       written.                +--config-cFILE                    The YAML configuration  +                                                       file overriding the     +                                                       default options.        +--verbosity-v[debug|info|warning|er  Set the verbosity of    +ror]  console logging         +                                                       messages. Log files     +                                                       are always set to       +                                                       'debug'.                +--help-h  Show this message and   +                                                       exit.                   +╰──────────────────────────────────────────────────────────────────────────────╯ + From 812226e396f667f2d9e628e1aabd76546f8c18a1 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Tue, 20 Aug 2024 20:21:29 -0700 Subject: [PATCH 31/84] finish proteindatabase --- casanovo/data/db_utils.py | 101 +++++++++++++++++---------------- casanovo/denovo/dataloaders.py | 6 +- tests/unit_tests/test_unit.py | 100 +++++++++++++++----------------- 3 files changed, 101 insertions(+), 106 deletions(-) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index a7b5e850..d249e0c7 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -1,12 +1,12 @@ """Unique methods used within db-search mode""" -import bisect import logging import os -from typing import List, Tuple +from typing import List import depthcharge.masses from numba import jit +import pandas as pd from pyteomics import fasta, parser logger = logging.getLogger("casanovo") @@ -28,11 +28,29 @@ class ProteinDatabase: """ - TODO + Store digested .fasta data and return candidate peptides for a given precursor mass. Parameters ---------- - TODO + fasta_path : str + Path to the FASTA file. + enzyme : str + The enzyme to use for digestion. + See pyteomics.parser.expasy_rules for valid enzymes. + digestion : str + The type of digestion to perform. Either 'full' or 'partial'. + missed_cleavages : int + The number of missed cleavages to allow. + min_peptide_len : int + The minimum length of peptides to consider. 
+ max_peptide_len : int + The maximum length of peptides to consider. + max_mods : int + The maximum number of modifications to allow per peptide. + precursor_tolerance : float + The precursor mass tolerance in ppm. + isotope_error : List[int] + Isotopes to consider when comparing predicted and observed precursor m/z's. """ def __init__( @@ -73,27 +91,34 @@ def get_candidates( The precursor mass-to-charge ratio. charge : int The precursor charge. + + Returns + ------- + candidates : List[Tuple[str, str]] + A list of candidate peptides and associated + protein. """ - candidates = set() + candidates = [] for e in self.isotope_error: iso_shift = ISOTOPE_SPACING * e - upper_bound = ( + upper_bound = float( ProteinDatabase._to_raw_mass(precursor_mz, charge) - iso_shift ) * (1 + (self.precursor_tolerance / 1e6)) - lower_bound = ( + lower_bound = float( ProteinDatabase._to_raw_mass(precursor_mz, charge) - iso_shift ) * (1 - (self.precursor_tolerance / 1e6)) - start, end = ProteinDatabase._get_mass_indices( - [x[1] for x in self.digest], lower_bound, upper_bound - ) + window = self.digest[ + (self.digest["calc_mass"] >= lower_bound) + & (self.digest["calc_mass"] <= upper_bound) + ] + candidates.append(window[["peptide", "calc_mass", "protein"]]) - candidates.update(self.digest[start:end]) - - candidates = list(candidates) - candidates.sort(key=lambda x: x[1]) - return candidates + candidates = pd.concat(candidates) + candidates.drop_duplicates(inplace=True) + candidates.sort_values(by=["calc_mass", "peptide"], inplace=True) + return list(candidates["peptide"]), list(candidates["protein"]) def _digest_fasta( self, @@ -128,9 +153,9 @@ def _digest_fasta( Returns ------- - mod_peptide_list : List[Tuple[str, float, str]] - A list of tuples containing the peptide sequence, mass, - and associated protein. Sorted by neutral mass in ascending order. + mod_peptide_list : pd.DataFrame + A Pandas DataFrame with peptide, mass, + and protein columns. Sorted by neutral mass in ascending order. """ # Verify the existence of the file: if not os.path.isfile(fasta_filename): @@ -180,17 +205,20 @@ def _digest_fasta( map(ProteinDatabase._convert_from_modx, peptide_isoforms) ) mod_peptide_list.extend( - (mod_pep, mass_calculator.mass(mod_pep), prot) + [mod_pep, mass_calculator.mass(mod_pep), prot] for mod_pep in peptide_isoforms ) - # Sort the peptides by mass and return. - mod_peptide_list.sort(key=lambda x: x[1]) - logger.info( - "Digestion complete. %d peptides generated.", len(mod_peptide_list) + # Create a DataFrame for easy sorting and filtering + pdb_df = pd.DataFrame( + mod_peptide_list, columns=["peptide", "calc_mass", "protein"] ) - return mod_peptide_list + pdb_df.sort_values(by=["calc_mass", "peptide"], inplace=True) + + logger.info("Digestion complete. %d peptides generated.", len(pdb_df)) + return pdb_df + @jit def _to_mz(precursor_mass, charge): """ Convert precursor neutral mass to m/z value. @@ -209,6 +237,7 @@ def _to_mz(precursor_mass, charge): """ return (precursor_mass + (charge * PROTON)) / charge + @jit def _to_raw_mass(mz_mass, charge): """ Convert precursor m/z value to neutral mass. @@ -227,30 +256,6 @@ def _to_raw_mass(mz_mass, charge): """ return charge * (mz_mass - PROTON) - def _get_mass_indices(masses, m_low, m_high): - """Grabs mass indices that fall within a specified range. - - Pulls from masses, a list of mass values. - Requires that the mass values are sorted in ascending order. 
- - Parameters - ---------- - masses : List[int] - List of mass values - m_low : int - Lower bound of mass range (inclusive) - m_high : int - Upper bound of mass range (inclusive) - - Return - ------ - indices : Tuple[int, int] - Indices of mass values that fall within the specified range - """ - start = bisect.bisect_left(masses, m_low) - end = bisect.bisect_right(masses, m_high) - return start, end - def _convert_from_modx(seq: str): """Converts peptide sequence from modX format to Casanovo-acceptable modifications. diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 4d5524f4..2d9e200b 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -284,11 +284,11 @@ def prepare_psm_batch( all_proteins = [] for idx in range(len(batch)): digest_data = pdb.get_candidates( - precursor_mzs[idx], - precursor_charges[idx], + float(precursor_mzs[idx]), + float(precursor_charges[idx]), ) try: - spec_peptides, _, pep_protein = list(zip(*digest_data)) + spec_peptides, pep_protein = digest_data all_spectra.append( spectra[idx].unsqueeze(0).repeat(len(spec_peptides), 1, 1) ) diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 7a37e771..2473a168 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -9,6 +9,7 @@ import einops import github import numpy as np +import pandas as pd import pytest import torch @@ -287,8 +288,7 @@ def test_digest_fasta_cleave(tiny_fasta_file): precursor_tolerance=20, isotope_error=[0], ) - peptide_list = pdb.digest - peptide_list = [x[0] for x in peptide_list] + peptide_list = list(pdb.digest["peptide"]) assert peptide_list == expected @@ -357,8 +357,7 @@ def test_digest_fasta_mods(tiny_fasta_file): precursor_tolerance=20, isotope_error=[0], ) - peptide_list = pdb.digest - peptide_list = [x[0] for x in peptide_list] + peptide_list = list(pdb.digest["peptide"]) peptide_list = [ x for x in peptide_list @@ -391,8 +390,7 @@ def test_length_restrictions(tiny_fasta_file): precursor_tolerance=20, isotope_error=[0], ) - peptide_list = pdb.digest - peptide_list = [x[0] for x in peptide_list] + peptide_list = list(pdb.digest["peptide"]) assert peptide_list == expected_long pdb = db_utils.ProteinDatabase( @@ -406,8 +404,7 @@ def test_length_restrictions(tiny_fasta_file): precursor_tolerance=20, isotope_error=[0], ) - peptide_list = pdb.digest - peptide_list = [x[0] for x in peptide_list] + peptide_list = list(pdb.digest["peptide"]) assert peptide_list == expected_short @@ -437,8 +434,7 @@ def test_digest_fasta_enzyme(tiny_fasta_file): precursor_tolerance=20, isotope_error=[0], ) - peptide_list = pdb.digest - peptide_list = [x[0] for x in peptide_list] + peptide_list = list(pdb.digest["peptide"]) assert peptide_list == expected_argc pdb = db_utils.ProteinDatabase( @@ -452,8 +448,7 @@ def test_digest_fasta_enzyme(tiny_fasta_file): precursor_tolerance=20, isotope_error=[0], ) - peptide_list = pdb.digest - peptide_list = [x[0] for x in peptide_list] + peptide_list = list(pdb.digest["peptide"]) assert peptide_list == expected_aspn @@ -478,8 +473,7 @@ def test_get_candidates(tiny_fasta_file): precursor_tolerance=10000, isotope_error=[0], ) - candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) - candidates = [x[0] for x in candidates] + candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_smallwindow == candidates pdb = db_utils.ProteinDatabase( @@ -493,8 +487,7 @@ def test_get_candidates(tiny_fasta_file): precursor_tolerance=150000, 
isotope_error=[0], ) - candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) - candidates = [x[0] for x in candidates] + candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_midwindow == candidates pdb = db_utils.ProteinDatabase( @@ -508,8 +501,7 @@ def test_get_candidates(tiny_fasta_file): precursor_tolerance=600000, isotope_error=[0], ) - candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) - candidates = [x[0] for x in candidates] + candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_widewindow == candidates @@ -522,35 +514,38 @@ def test_get_candidates_isotope_error(tiny_fasta_file): # 3: [977.510108, 997.257787] peptide_list = [ - ("A", 1001), - ("B", 1000), - ("C", 999), - ("D", 998), - ("E", 997), - ("F", 996), - ("G", 995), - ("H", 994), - ("I", 993), - ("J", 992), - ("K", 991), - ("L", 990), - ("M", 989), - ("N", 988), - ("O", 987), - ("P", 986), - ("Q", 985), - ("R", 984), - ("S", 983), - ("T", 982), - ("U", 981), - ("V", 980), - ("W", 979), - ("X", 978), - ("Y", 977), - ("Z", 976), + ("A", 1001, "foo"), + ("B", 1000, "foo"), + ("C", 999, "foo"), + ("D", 998, "foo"), + ("E", 997, "foo"), + ("F", 996, "foo"), + ("G", 995, "foo"), + ("H", 994, "foo"), + ("I", 993, "foo"), + ("J", 992, "foo"), + ("K", 991, "foo"), + ("L", 990, "foo"), + ("M", 989, "foo"), + ("N", 988, "foo"), + ("O", 987, "foo"), + ("P", 986, "foo"), + ("Q", 985, "foo"), + ("R", 984, "foo"), + ("S", 983, "foo"), + ("T", 982, "foo"), + ("U", 981, "foo"), + ("V", 980, "foo"), + ("W", 979, "foo"), + ("X", 978, "foo"), + ("Y", 977, "foo"), + ("Z", 976, "foo"), ] - peptide_list.sort(key=lambda x: x[1]) + peptide_list = pd.DataFrame( + peptide_list, columns=["peptide", "calc_mass", "protein"] + ) + peptide_list.sort_values("calc_mass", inplace=True) expected_isotope0 = list("UTSRQPONMLKJIHGFEDCB") expected_isotope1 = list("VUTSRQPONMLKJIHGFEDC") @@ -570,8 +565,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file): isotope_error=[0], ) pdb.digest = peptide_list - candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) - candidates = [x[0] for x in candidates] + candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_isotope0 == candidates pdb = db_utils.ProteinDatabase( @@ -586,8 +580,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file): isotope_error=[1], ) pdb.digest = peptide_list - candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) - candidates = [x[0] for x in candidates] + candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_isotope1 == candidates pdb = db_utils.ProteinDatabase( @@ -602,8 +595,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file): isotope_error=[2], ) pdb.digest = peptide_list - candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) - candidates = [x[0] for x in candidates] + candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_isotope2 == candidates pdb = db_utils.ProteinDatabase( @@ -618,8 +610,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file): isotope_error=[3], ) pdb.digest = peptide_list - candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) - candidates = [x[0] for x in candidates] + candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_isotope3 == candidates pdb = db_utils.ProteinDatabase( @@ -634,8 +625,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file): isotope_error=[0, 1, 2, 3], ) pdb.digest = peptide_list - candidates = 
pdb.get_candidates(precursor_mz=496.2, charge=2) - candidates = [x[0] for x in candidates] + candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_isotope0123 == candidates From cfd39e80b4898077f92cacc6491a5c891c5a9454 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Fri, 23 Aug 2024 14:12:50 -0700 Subject: [PATCH 32/84] all comments addressed --- casanovo/config.yaml | 7 +++- casanovo/data/db_utils.py | 68 +++++++++++++++++++++++++++------ casanovo/denovo/model_runner.py | 1 + tests/conftest.py | 4 ++ tests/unit_tests/test_unit.py | 56 +++++++++++++++++++++++++++ 5 files changed, 123 insertions(+), 13 deletions(-) diff --git a/casanovo/config.yaml b/casanovo/config.yaml index 860cfabb..87795db8 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -46,7 +46,7 @@ devices: # See pyteomics.parser.expasy_rules for valid enzymes enzyme: "trypsin" # Digestion type for candidate peptide generation. -# Full: standard digestion. Semi: Include products of semi-specific cleavage +# full: standard digestion. semi: Include products of semi-specific cleavage digestion: "full" # Number of allowed missed cleavages when digesting protein missed_cleavages: 0 @@ -55,6 +55,11 @@ missed_cleavages: 0 max_mods: # Maximum peptide length to consider max_peptide_len: 50 +# Toggle allowed modifications on/off +# Permanent fixed mod (don't include): C+57.021 +# Allowed variable mods: M+15.995, N+0.984, Q+0.984, +# Allowed N-terminal mods: +42.011, +43.006, -17.027, +43.006-17.027 +allowed_mods: "M+15.995,N+0.984,Q+0.984,+42.011,+43.006,-17.027,+43.006-17.027" ### diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index d249e0c7..2bdf3828 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -15,16 +15,6 @@ PROTON = 1.00727646677 ISOTOPE_SPACING = 1.003355 -var_mods = { - "d": ["N", "Q"], - "ox": ["M"], - "ace-": True, - "carb-": True, - "nh3x-": True, - "carbnh3x-": True, -} -fixed_mods = {"carbm": ["C"]} - class ProteinDatabase: """ @@ -51,6 +41,8 @@ class ProteinDatabase: The precursor mass tolerance in ppm. isotope_error : List[int] Isotopes to consider when comparing predicted and observed precursor m/z's. + allowed_mods : List[str] + A list of allowed modifications to consider. """ def __init__( @@ -64,7 +56,11 @@ def __init__( max_mods: int, precursor_tolerance: float, isotope_error: List[int], + allowed_mods: List[str], ): + self.fixed_mods, self.var_mods = self._construct_mods_dict( + allowed_mods + ) self.digest = self._digest_fasta( fasta_path, enzyme, @@ -197,8 +193,8 @@ def _digest_fasta( for pep, prot in peptide_list: peptide_isoforms = parser.isoforms( pep, - variable_mods=var_mods, - fixed_mods=fixed_mods, + variable_mods=self.var_mods, + fixed_mods=self.fixed_mods, max_mods=max_mods, ) peptide_isoforms = list( @@ -218,6 +214,54 @@ def _digest_fasta( logger.info("Digestion complete. %d peptides generated.", len(pdb_df)) return pdb_df + def _construct_mods_dict(self, allowed_mods): + """ + Constructs dictionaries of fixed and variable modifications. + + Parameters + ---------- + allowed_mods : str + A comma-separated list of allowed modifications. + + Returns + ------- + fixed_mods : dict + A dictionary of fixed modifications. + var_mods : dict + A dictionary of variable modifications. 
+ """ + fixed_mods = {"carbm": ["C"]} + var_mods = {} + + if allowed_mods is "" or None: + return fixed_mods, var_mods + for mod in allowed_mods.split(","): + if mod == "M+15.995": + if "ox" not in var_mods: + var_mods["ox"] = [] + var_mods["ox"].append("M") + elif mod == "N+0.984": + if "d" not in var_mods: + var_mods["d"] = [] + var_mods["d"].append("N") + elif mod == "Q+0.984": + if "d" not in var_mods: + var_mods["d"] = [] + var_mods["d"].append("Q") + elif mod == "+42.011": + var_mods["ace-"] = True + elif mod == "+43.006": + var_mods["carb-"] = True + elif mod == "-17.027": + var_mods["nh3x-"] = True + elif mod == "+43.006-17.027": + var_mods["carbnh3x-"] = True + else: + logger.error("Modification %s not recognized.", mod) + raise ValueError(f"Modification {mod} not recognized.") + + return fixed_mods, var_mods + @jit def _to_mz(precursor_mass, charge): """ diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index b90f06b0..789c960b 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -124,6 +124,7 @@ def db_search( self.config.max_mods, self.config.precursor_mass_tol, self.config.isotope_error_range, + self.config.allowed_mods, ) self.loaders.setup(stage="test", annotated=False) self.trainer.predict(self.model, self.loaders.db_dataloader()) diff --git a/tests/conftest.py b/tests/conftest.py index f20d7879..452316c8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -299,6 +299,10 @@ def tiny_config(tmp_path): "-17.027": -17.026549, "+43.006-17.027": 25.980265, }, + "allowed_mods": ( + "M+15.995,N+0.984,Q+0.984," + "+42.011,+43.006,-17.027,+43.006-17.027" + ), } cfg_file = tmp_path / "config.yml" diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 2473a168..a31e2024 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -287,6 +287,10 @@ def test_digest_fasta_cleave(tiny_fasta_file): max_mods=0, precursor_tolerance=20, isotope_error=[0], + allowed_mods=( + "M+15.995,N+0.984,Q+0.984," + "+42.011,+43.006,-17.027,+43.006-17.027" + ), ) peptide_list = list(pdb.digest["peptide"]) assert peptide_list == expected @@ -356,6 +360,10 @@ def test_digest_fasta_mods(tiny_fasta_file): max_mods=1, precursor_tolerance=20, isotope_error=[0], + allowed_mods=( + "M+15.995,N+0.984,Q+0.984," + "+42.011,+43.006,-17.027,+43.006-17.027" + ), ) peptide_list = list(pdb.digest["peptide"]) peptide_list = [ @@ -389,6 +397,10 @@ def test_length_restrictions(tiny_fasta_file): max_mods=0, precursor_tolerance=20, isotope_error=[0], + allowed_mods=( + "M+15.995,N+0.984,Q+0.984," + "+42.011,+43.006,-17.027,+43.006-17.027" + ), ) peptide_list = list(pdb.digest["peptide"]) assert peptide_list == expected_long @@ -403,6 +415,10 @@ def test_length_restrictions(tiny_fasta_file): max_mods=0, precursor_tolerance=20, isotope_error=[0], + allowed_mods=( + "M+15.995,N+0.984,Q+0.984," + "+42.011,+43.006,-17.027,+43.006-17.027" + ), ) peptide_list = list(pdb.digest["peptide"]) assert peptide_list == expected_short @@ -433,6 +449,10 @@ def test_digest_fasta_enzyme(tiny_fasta_file): max_mods=0, precursor_tolerance=20, isotope_error=[0], + allowed_mods=( + "M+15.995,N+0.984,Q+0.984," + "+42.011,+43.006,-17.027,+43.006-17.027" + ), ) peptide_list = list(pdb.digest["peptide"]) assert peptide_list == expected_argc @@ -447,6 +467,10 @@ def test_digest_fasta_enzyme(tiny_fasta_file): max_mods=0, precursor_tolerance=20, isotope_error=[0], + allowed_mods=( + "M+15.995,N+0.984,Q+0.984," + 
"+42.011,+43.006,-17.027,+43.006-17.027" + ), ) peptide_list = list(pdb.digest["peptide"]) assert peptide_list == expected_aspn @@ -472,6 +496,10 @@ def test_get_candidates(tiny_fasta_file): max_mods=0, precursor_tolerance=10000, isotope_error=[0], + allowed_mods=( + "M+15.995,N+0.984,Q+0.984," + "+42.011,+43.006,-17.027,+43.006-17.027" + ), ) candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_smallwindow == candidates @@ -486,6 +514,10 @@ def test_get_candidates(tiny_fasta_file): max_mods=0, precursor_tolerance=150000, isotope_error=[0], + allowed_mods=( + "M+15.995,N+0.984,Q+0.984," + "+42.011,+43.006,-17.027,+43.006-17.027" + ), ) candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_midwindow == candidates @@ -500,6 +532,10 @@ def test_get_candidates(tiny_fasta_file): max_mods=0, precursor_tolerance=600000, isotope_error=[0], + allowed_mods=( + "M+15.995,N+0.984,Q+0.984," + "+42.011,+43.006,-17.027,+43.006-17.027" + ), ) candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_widewindow == candidates @@ -563,6 +599,10 @@ def test_get_candidates_isotope_error(tiny_fasta_file): max_mods=0, precursor_tolerance=10000, isotope_error=[0], + allowed_mods=( + "M+15.995,N+0.984,Q+0.984," + "+42.011,+43.006,-17.027,+43.006-17.027" + ), ) pdb.digest = peptide_list candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) @@ -578,6 +618,10 @@ def test_get_candidates_isotope_error(tiny_fasta_file): max_mods=0, precursor_tolerance=10000, isotope_error=[1], + allowed_mods=( + "M+15.995,N+0.984,Q+0.984," + "+42.011,+43.006,-17.027,+43.006-17.027" + ), ) pdb.digest = peptide_list candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) @@ -593,6 +637,10 @@ def test_get_candidates_isotope_error(tiny_fasta_file): max_mods=0, precursor_tolerance=10000, isotope_error=[2], + allowed_mods=( + "M+15.995,N+0.984,Q+0.984," + "+42.011,+43.006,-17.027,+43.006-17.027" + ), ) pdb.digest = peptide_list candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) @@ -608,6 +656,10 @@ def test_get_candidates_isotope_error(tiny_fasta_file): max_mods=0, precursor_tolerance=10000, isotope_error=[3], + allowed_mods=( + "M+15.995,N+0.984,Q+0.984," + "+42.011,+43.006,-17.027,+43.006-17.027" + ), ) pdb.digest = peptide_list candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) @@ -623,6 +675,10 @@ def test_get_candidates_isotope_error(tiny_fasta_file): max_mods=0, precursor_tolerance=10000, isotope_error=[0, 1, 2, 3], + allowed_mods=( + "M+15.995,N+0.984,Q+0.984," + "+42.011,+43.006,-17.027,+43.006-17.027" + ), ) pdb.digest = peptide_list candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) From 106c4ecc524c202a7624d6fa025afc82adac1a0c Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Wed, 28 Aug 2024 16:41:24 -0700 Subject: [PATCH 33/84] new comments addressed --- casanovo/config.yaml | 22 +-- casanovo/data/db_utils.py | 276 +++++++++++++++++--------------- casanovo/denovo/dataloaders.py | 22 +-- casanovo/denovo/model.py | 2 +- casanovo/denovo/model_runner.py | 2 +- tests/conftest.py | 25 ++- tests/test_integration.py | 4 +- tests/unit_tests/test_unit.py | 74 ++++----- 8 files changed, 209 insertions(+), 218 deletions(-) diff --git a/casanovo/config.yaml b/casanovo/config.yaml index 87795db8..6c9063f5 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -11,13 +11,13 @@ # Max absolute difference allowed with respect to observed precursor m/z. 
# denovo: Predictions outside the tolerance range are assigned a negative peptide score. -# db-search: Used to create mas windows for candidate generation. +# db-search: Select candidate peptides within the specified precursor m/z tolerance. precursor_mass_tol: 50 # ppm # Isotopes to consider when comparing predicted and observed precursor m/z's. isotope_error_range: [0, 1] -# The minimum length of predicted/scored peptides. +# The minimum length of considered peptides. min_peptide_len: 6 -# Number of spectra or psms in one inference batch. +# Number of spectra in one inference batch. predict_batch_size: 1024 @@ -43,21 +43,21 @@ devices: ### # Enzyme for in silico digestion, used to generate candidate peptides. -# See pyteomics.parser.expasy_rules for valid enzymes +# See pyteomics.parser.expasy_rules for valid enzymes. enzyme: "trypsin" # Digestion type for candidate peptide generation. -# full: standard digestion. semi: Include products of semi-specific cleavage +# full: standard digestion. semi: Include products of semi-specific cleavage. digestion: "full" -# Number of allowed missed cleavages when digesting protein +# Number of allowed missed cleavages when digesting protein. missed_cleavages: 0 -# Maximum number of amino acid modifications per peptide. +# Maximum number of amino acid modifications per peptide, # None generates all possible isoforms as candidates. -max_mods: -# Maximum peptide length to consider +max_mods: 0 +# Maximum peptide length to consider. max_peptide_len: 50 -# Toggle allowed modifications on/off +# Select which modifications from the vocabulary can be used in candidate creation. # Permanent fixed mod (don't include): C+57.021 -# Allowed variable mods: M+15.995, N+0.984, Q+0.984, +# Allowed variable mods: M+15.995, N+0.984, Q+0.984 # Allowed N-terminal mods: +42.011, +43.006, -17.027, +43.006-17.027 allowed_mods: "M+15.995,N+0.984,Q+0.984,+42.011,+43.006,-17.027,+43.006-17.027" diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index 2bdf3828..c1d5e91e 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -2,11 +2,11 @@ import logging import os -from typing import List +from typing import List, Tuple import depthcharge.masses -from numba import jit import pandas as pd +from numba import njit from pyteomics import fasta, parser logger = logging.getLogger("casanovo") @@ -39,10 +39,10 @@ class ProteinDatabase: The maximum number of modifications to allow per peptide. precursor_tolerance : float The precursor mass tolerance in ppm. - isotope_error : List[int] - Isotopes to consider when comparing predicted and observed precursor m/z's. - allowed_mods : List[str] - A list of allowed modifications to consider. + isotope_error : Tuple[int, int] + Isotope range [min, max] to consider when comparing predicted and observed precursor m/z's. + allowed_mods : str + A comma separated string of allowed modifications to consider. 
""" def __init__( @@ -55,13 +55,11 @@ def __init__( max_peptide_len: int, max_mods: int, precursor_tolerance: float, - isotope_error: List[int], - allowed_mods: List[str], + isotope_error: Tuple[int, int], + allowed_mods: str, ): - self.fixed_mods, self.var_mods = self._construct_mods_dict( - allowed_mods - ) - self.digest = self._digest_fasta( + self.fixed_mods, self.var_mods = _construct_mods_dict(allowed_mods) + self.db_peptides = self._digest_fasta( fasta_path, enzyme, digestion, @@ -77,7 +75,7 @@ def get_candidates( self, precursor_mz: float, charge: int, - ): + ) -> List[Tuple[str, str]]: """ Returns a list of candidate peptides that fall within the specified mass range. @@ -96,18 +94,18 @@ def get_candidates( """ candidates = [] - for e in self.isotope_error: + for e in range(self.isotope_error[0], self.isotope_error[1] + 1): iso_shift = ISOTOPE_SPACING * e upper_bound = float( - ProteinDatabase._to_raw_mass(precursor_mz, charge) - iso_shift + _to_raw_mass(precursor_mz, charge) - iso_shift ) * (1 + (self.precursor_tolerance / 1e6)) lower_bound = float( - ProteinDatabase._to_raw_mass(precursor_mz, charge) - iso_shift + _to_raw_mass(precursor_mz, charge) - iso_shift ) * (1 - (self.precursor_tolerance / 1e6)) - window = self.digest[ - (self.digest["calc_mass"] >= lower_bound) - & (self.digest["calc_mass"] <= upper_bound) + window = self.db_peptides[ + (self.db_peptides["calc_mass"] >= lower_bound) + & (self.db_peptides["calc_mass"] <= upper_bound) ] candidates.append(window[["peptide", "calc_mass", "protein"]]) @@ -125,7 +123,7 @@ def _digest_fasta( max_mods: int, min_peptide_length: int, max_peptide_length: int, - ): + ) -> pd.DataFrame: """ Digests a FASTA file and returns the peptides, their masses, and associated protein. @@ -158,13 +156,18 @@ def _digest_fasta( logger.error("File %s does not exist.", fasta_filename) raise FileNotFoundError(f"File {fasta_filename} does not exist.") - fasta_data = fasta.read(fasta_filename) peptide_list = [] if digestion not in ["full", "partial"]: logger.error("Digestion type %s not recognized.", digestion) raise ValueError(f"Digestion type {digestion} not recognized.") + if enzyme not in parser.expasy_rules: + logger.error( + "Enzyme %s not recognized. 
Must be in pyteomics.parser.expasy_rules", + enzyme, + ) + raise ValueError(f"Enzyme {enzyme} not recognized.") semi = digestion == "partial" - for header, seq in fasta_data: + for header, seq in fasta.read(fasta_filename): pep_set = parser.cleave( seq, rule=parser.expasy_rules[enzyme], @@ -182,136 +185,143 @@ def _digest_fasta( aa in pep for aa in "BJOUXZ" ): # Check for incorrect AA letters logger.warn( - "Skipping peptide with ambiguous amino acids: %s", pep + "Skipping peptide with unknown amino acids: %s", pep ) continue peptide_list.append((pep, protein)) # Generate modified peptides mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") - mod_peptide_list = [] - for pep, prot in peptide_list: - peptide_isoforms = parser.isoforms( - pep, - variable_mods=self.var_mods, - fixed_mods=self.fixed_mods, - max_mods=max_mods, - ) - peptide_isoforms = list( - map(ProteinDatabase._convert_from_modx, peptide_isoforms) - ) - mod_peptide_list.extend( - [mod_pep, mass_calculator.mass(mod_pep), prot] - for mod_pep in peptide_isoforms + peptide_isoforms = [ + ( + parser.isoforms( + pep, + variable_mods=self.var_mods, + fixed_mods=self.fixed_mods, + max_mods=max_mods, + ), + prot, ) - + for pep, prot in peptide_list + ] + mod_peptide_list = [ + (mod_pep, mass_calculator.mass(mod_pep), prot) + for isos, prot in peptide_isoforms + for mod_pep in map(_convert_from_modx, isos) + ] # Create a DataFrame for easy sorting and filtering - pdb_df = pd.DataFrame( + pep_table = pd.DataFrame( mod_peptide_list, columns=["peptide", "calc_mass", "protein"] ) - pdb_df.sort_values(by=["calc_mass", "peptide"], inplace=True) - - logger.info("Digestion complete. %d peptides generated.", len(pdb_df)) - return pdb_df + pep_table.sort_values(by=["calc_mass", "peptide"], inplace=True) - def _construct_mods_dict(self, allowed_mods): - """ - Constructs dictionaries of fixed and variable modifications. + logger.info( + "Digestion complete. %d peptides generated.", len(pep_table) + ) + return pep_table - Parameters - ---------- - allowed_mods : str - A comma-separated list of allowed modifications. - Returns - ------- - fixed_mods : dict - A dictionary of fixed modifications. - var_mods : dict - A dictionary of variable modifications. - """ - fixed_mods = {"carbm": ["C"]} - var_mods = {} - - if allowed_mods is "" or None: - return fixed_mods, var_mods - for mod in allowed_mods.split(","): - if mod == "M+15.995": - if "ox" not in var_mods: - var_mods["ox"] = [] - var_mods["ox"].append("M") - elif mod == "N+0.984": - if "d" not in var_mods: - var_mods["d"] = [] - var_mods["d"].append("N") - elif mod == "Q+0.984": - if "d" not in var_mods: - var_mods["d"] = [] - var_mods["d"].append("Q") - elif mod == "+42.011": - var_mods["ace-"] = True - elif mod == "+43.006": - var_mods["carb-"] = True - elif mod == "-17.027": - var_mods["nh3x-"] = True - elif mod == "+43.006-17.027": - var_mods["carbnh3x-"] = True - else: - logger.error("Modification %s not recognized.", mod) - raise ValueError(f"Modification {mod} not recognized.") +@njit +def _to_mz(precursor_mass, charge): + """ + Convert precursor neutral mass to m/z value. - return fixed_mods, var_mods + Parameters + ---------- + precursor_mass : float + The precursor neutral mass. + charge : int + The precursor charge. + + Returns + ------- + mz : float + The calculated precursor mass-to-charge ratio. + """ + return (precursor_mass + (charge * PROTON)) / charge - @jit - def _to_mz(precursor_mass, charge): - """ - Convert precursor neutral mass to m/z value. 
- Parameters - ---------- - precursor_mass : float - The precursor neutral mass. - charge : int - The precursor charge. +@njit +def _to_raw_mass(mz_mass, charge): + """ + Convert precursor m/z value to neutral mass. - Returns - ------- - mz : float - The calculated precursor mass-to-charge ratio. - """ - return (precursor_mass + (charge * PROTON)) / charge + Parameters + ---------- + mz_mass : float + The precursor mass-to-charge ratio. + charge : int + The precursor charge. + + Returns + ------- + mass : float + The calculated precursor neutral mass. + """ + return charge * (mz_mass - PROTON) - @jit - def _to_raw_mass(mz_mass, charge): - """ - Convert precursor m/z value to neutral mass. - Parameters - ---------- - mz_mass : float - The precursor mass-to-charge ratio. - charge : int - The precursor charge. +def _convert_from_modx(seq: str): + """Converts peptide sequence from modX format to Casanovo-acceptable modifications. - Returns - ------- - mass : float - The calculated precursor neutral mass. - """ - return charge * (mz_mass - PROTON) + Args: + seq (str): Peptide in modX format + """ + seq = seq.replace("carbmC", "C+57.021") # Fixed modification + seq = seq.replace("oxM", "M+15.995") + seq = seq.replace("dN", "N+0.984") + seq = seq.replace("dQ", "Q+0.984") + seq = seq.replace("ace-", "+42.011") + seq = seq.replace("carbnh3x-", "+43.006-17.027") + seq = seq.replace("carb-", "+43.006") + seq = seq.replace("nh3x-", "-17.027") + return seq + + +def _construct_mods_dict(allowed_mods): + """ + Constructs dictionaries of fixed and variable modifications. - def _convert_from_modx(seq: str): - """Converts peptide sequence from modX format to Casanovo-acceptable modifications. + Parameters + ---------- + allowed_mods : str + A comma-separated list of allowed modifications. + + Returns + ------- + fixed_mods : dict + A dictionary of fixed modifications. + var_mods : dict + A dictionary of variable modifications. 
+ """ + fixed_mods = {"carbm": ["C"]} + var_mods = {} - Args: - seq (str): Peptide in modX format - """ - seq = seq.replace("carbmC", "C+57.021") # Fixed modification - seq = seq.replace("oxM", "M+15.995") - seq = seq.replace("dN", "N+0.984") - seq = seq.replace("dQ", "Q+0.984") - seq = seq.replace("ace-", "+42.011") - seq = seq.replace("carbnh3x-", "+43.006-17.027") - seq = seq.replace("carb-", "+43.006") - seq = seq.replace("nh3x-", "-17.027") - return seq + if not allowed_mods: + return fixed_mods, var_mods + for mod in allowed_mods.split(","): + if mod == "M+15.995": + if "ox" not in var_mods: + var_mods["ox"] = [] + var_mods["ox"].append("M") + elif mod == "N+0.984": + if "d" not in var_mods: + var_mods["d"] = [] + var_mods["d"].append("N") + elif mod == "Q+0.984": + if "d" not in var_mods: + var_mods["d"] = [] + var_mods["d"].append("Q") + elif mod == "+42.011": + var_mods["ace-"] = True + elif mod == "+43.006": + var_mods["carb-"] = True + elif mod == "-17.027": + var_mods["nh3x-"] = True + elif mod == "+43.006-17.027": + var_mods["carbnh3x-"] = True + else: + logger.error("Modification %s not recognized.", mod) + raise ValueError(f"Modification {mod} not recognized.") + + return fixed_mods, var_mods diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 2d9e200b..a6ab8ddc 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -1,14 +1,14 @@ """Data loaders for the de novo sequencing task.""" import functools -import os import logging +import os from typing import List, Optional, Tuple -from depthcharge.data import AnnotatedSpectrumIndex import lightning.pytorch as pl import numpy as np import torch +from depthcharge.data import AnnotatedSpectrumIndex from ..data import db_utils from ..data.datasets import ( @@ -89,7 +89,7 @@ def __init__( self.train_dataset = None self.valid_dataset = None self.test_dataset = None - self.pdb = None + self.protein_database = None def setup(self, stage: str = None, annotated: bool = True) -> None: """ @@ -187,7 +187,9 @@ def db_dataloader(self) -> torch.utils.data.DataLoader: return torch.utils.data.DataLoader( self.test_dataset, batch_size=self.eval_batch_size, - collate_fn=functools.partial(prepare_psm_batch, pdb=self.pdb), + collate_fn=functools.partial( + prepare_psm_batch, protein_database=self.protein_database + ), pin_memory=True, num_workers=self.n_workers, shuffle=False, @@ -235,8 +237,8 @@ def prepare_batch( def prepare_psm_batch( batch: List[Tuple[torch.Tensor, float, int, str]], - pdb: db_utils.ProteinDatabase, -): + protein_database: db_utils.ProteinDatabase, +) -> Tuple[torch.Tensor, torch.Tensor, np.ndarray, List[str], List[str]]: """ Collate MS/MS spectra into a batch for DB search. @@ -249,7 +251,7 @@ def prepare_psm_batch( A batch of data from an AnnotatedSpectrumDataset, consisting of for each spectrum (i) a tensor with the m/z and intensity peak values, (ii), the precursor m/z, (iii) the precursor charge, (iv) the spectrum identifier. - pdb : db_utils.ProteinDatabase + protein_database : db_utils.ProteinDatabase The protein database to use for candidate peptide retrieval. 
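The collate function being wired up here fans every spectrum out to one row per candidate peptide, so the scoring model sees plain (spectrum, precursor, peptide) rows. A minimal sketch of that tiling step, with a made-up peak count and candidates borrowed from the test fixtures:

    import torch

    spectrum = torch.rand(75, 2)            # 75 peaks x (m/z, intensity), dummy data
    candidates = ["LLIYGASTR", "ATSIPAR"]   # candidate peptides for this spectrum

    # Repeat the spectrum once per candidate so each PSM becomes its own row.
    tiled = spectrum.unsqueeze(0).repeat(len(candidates), 1, 1)
    assert tiled.shape == (2, 75, 2)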
Returns @@ -283,9 +285,9 @@ def prepare_psm_batch( all_peptides = [] all_proteins = [] for idx in range(len(batch)): - digest_data = pdb.get_candidates( - float(precursor_mzs[idx]), - float(precursor_charges[idx]), + digest_data = protein_database.get_candidates( + precursor_mzs[idx].type(torch.float64).item(), + precursor_charges[idx].type(torch.int64).item(), ) try: spec_peptides, pep_protein = digest_data diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 79848682..b38a27c0 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -991,7 +991,7 @@ def configure_optimizers( class DbSpec2Pep(Spec2Pep): """ - Subclass of Spec2Pep for the use of Casanovo as an \ + Subclass of Spec2Pep for the use of Casanovo as an MS/MS database search score function. Uses teacher forcing to 'query' Casanovo for its score for each AA diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 789c960b..6928560d 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -114,7 +114,7 @@ def db_search( self.writer.set_ms_run(test_index.ms_files) self.initialize_data_module(test_index=test_index) - self.loaders.pdb = db_utils.ProteinDatabase( + self.loaders.protein_database = db_utils.ProteinDatabase( fasta_path, self.config.enzyme, self.config.digestion, diff --git a/tests/conftest.py b/tests/conftest.py index 452316c8..90e522fe 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -28,7 +28,7 @@ def tiny_fasta_file(tmp_path): @pytest.fixture -def mgf_db_search(tmp_path): +def mgf_medium(tmp_path): """An MGF file with 7 spectra and scan numbers, C+57.021 mass modification considered""" peptides = [ "ATSIPAR", @@ -40,10 +40,10 @@ def mgf_db_search(tmp_path): "FSGSGSGTDFTLTISSLQPEDFAVYYCQQDYNLP", ] mgf_file = tmp_path / "db_search.mgf" - return _create_mgf(peptides, mgf_file, c_mod=True) + return _create_mgf(peptides, mgf_file, mod_aa_mass={"C": 160.030649}) -def _create_mgf(peptides, mgf_file, random_state=42, c_mod=False): +def _create_mgf(peptides, mgf_file, random_state=42, mod_aa_mass=None): """ Create a fake MGF file from one or more peptides. @@ -55,9 +55,9 @@ def _create_mgf(peptides, mgf_file, random_state=42, c_mod=False): The MGF file to create. random_state : int or numpy.random.Generator, optional The random seed. The charge states are chosen to be 2 or 3 randomly. - c_mod : bool, optional - Whether to use the constant carbamidomethylation - of C in mass calculations. + mod_aa_mass : dict, optional + A dictionary that specifies the modified masses of amino acids. + e.g. {"C": 160.030649} for carbamidomethylated C. Returns ------- @@ -65,7 +65,7 @@ def _create_mgf(peptides, mgf_file, random_state=42, c_mod=False): """ rng = np.random.default_rng(random_state) entries = [ - _create_mgf_entry(p, rng.choice([2, 3]), c_mod) for p in peptides + _create_mgf_entry(p, rng.choice([2, 3]), mod_aa_mass) for p in peptides ] with mgf_file.open("w+") as mgf_ref: mgf_ref.write("\n".join(entries)) @@ -73,7 +73,7 @@ def _create_mgf(peptides, mgf_file, random_state=42, c_mod=False): return mgf_file -def _create_mgf_entry(peptide, charge=2, c_mod=False): +def _create_mgf_entry(peptide, charge=2, mod_aa_mass=None): """ Create a MassIVE-KB style MGF entry for a single PSM. @@ -83,20 +83,19 @@ def _create_mgf_entry(peptide, charge=2, c_mod=False): A peptide sequence. charge : int, optional The peptide charge state. - c_mod : bool, optional - Whether to use the constant carbamidomethylation - of C in mass calculations. 
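The fixture change above replaces the boolean `c_mod` flag with an explicit `mod_aa_mass` mapping. A small sketch of how such a mapping feeds the pyteomics mass helpers used in the fixtures; 160.030649 is carbamidomethylated cysteine, and the peptide mirrors one of the fixture sequences:

    from pyteomics.mass import calculate_mass, fast_mass, std_aa_mass

    peptide, charge = "VTLSCR", 2

    # Unmodified precursor m/z.
    mz_plain = calculate_mass(peptide, charge=charge)

    # Same peptide with a fixed C modification supplied via aa_mass.
    aa_mass = dict(std_aa_mass)   # copy, then override the C mass
    aa_mass["C"] = 160.030649
    mz_mod = fast_mass(peptide, charge=charge, aa_mass=aa_mass)

    assert mz_mod > mz_plain      # +57.021 Da on C, spread over the charge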
+ mod_aa_mass : dict, optional + A dictionary that specifies the modified masses of amino acids. Returns ------- str The PSM entry in an MGF file format. """ - if not c_mod: + if mod_aa_mass is None: precursor_mz = calculate_mass(peptide, charge=int(charge)) else: aa_mass = std_aa_mass - aa_mass.update({"C": 160.030649}) # Carbamidomethylated C mass + aa_mass.update(mod_aa_mass) precursor_mz = fast_mass(peptide, charge=int(charge), aa_mass=aa_mass) mzs, intensities = _peptide_to_peaks(peptide, charge) frags = "\n".join([f"{m} {i}" for m, i in zip(mzs, intensities)]) diff --git a/tests/test_integration.py b/tests/test_integration.py index 61f735c3..4275d792 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -8,7 +8,7 @@ def test_db_search( - mgf_db_search, tiny_fasta_file, tiny_config, tmp_path, monkeypatch + mgf_medium, tiny_fasta_file, tiny_config, tmp_path, monkeypatch ): # Run a command: monkeypatch.setattr(casanovo, "__version__", "4.1.0") @@ -24,7 +24,7 @@ def test_db_search( tiny_config, "--output", str(output_path), - str(mgf_db_search), + str(mgf_medium), str(tiny_fasta_file), ] diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index a31e2024..51d9a3c9 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -286,13 +286,13 @@ def test_digest_fasta_cleave(tiny_fasta_file): max_peptide_len=50, max_mods=0, precursor_tolerance=20, - isotope_error=[0], + isotope_error=[0, 0], allowed_mods=( "M+15.995,N+0.984,Q+0.984," "+42.011,+43.006,-17.027,+43.006-17.027" ), ) - peptide_list = list(pdb.digest["peptide"]) + peptide_list = list(pdb.db_peptides["peptide"]) assert peptide_list == expected @@ -359,13 +359,13 @@ def test_digest_fasta_mods(tiny_fasta_file): max_peptide_len=50, max_mods=1, precursor_tolerance=20, - isotope_error=[0], + isotope_error=[0, 0], allowed_mods=( "M+15.995,N+0.984,Q+0.984," "+42.011,+43.006,-17.027,+43.006-17.027" ), ) - peptide_list = list(pdb.digest["peptide"]) + peptide_list = list(pdb.db_peptides["peptide"]) peptide_list = [ x for x in peptide_list @@ -396,13 +396,13 @@ def test_length_restrictions(tiny_fasta_file): max_peptide_len=50, max_mods=0, precursor_tolerance=20, - isotope_error=[0], + isotope_error=[0, 0], allowed_mods=( "M+15.995,N+0.984,Q+0.984," "+42.011,+43.006,-17.027,+43.006-17.027" ), ) - peptide_list = list(pdb.digest["peptide"]) + peptide_list = list(pdb.db_peptides["peptide"]) assert peptide_list == expected_long pdb = db_utils.ProteinDatabase( @@ -414,13 +414,13 @@ def test_length_restrictions(tiny_fasta_file): max_peptide_len=8, max_mods=0, precursor_tolerance=20, - isotope_error=[0], + isotope_error=[0, 0], allowed_mods=( "M+15.995,N+0.984,Q+0.984," "+42.011,+43.006,-17.027,+43.006-17.027" ), ) - peptide_list = list(pdb.digest["peptide"]) + peptide_list = list(pdb.db_peptides["peptide"]) assert peptide_list == expected_short @@ -448,13 +448,13 @@ def test_digest_fasta_enzyme(tiny_fasta_file): max_peptide_len=50, max_mods=0, precursor_tolerance=20, - isotope_error=[0], + isotope_error=[0, 0], allowed_mods=( "M+15.995,N+0.984,Q+0.984," "+42.011,+43.006,-17.027,+43.006-17.027" ), ) - peptide_list = list(pdb.digest["peptide"]) + peptide_list = list(pdb.db_peptides["peptide"]) assert peptide_list == expected_argc pdb = db_utils.ProteinDatabase( @@ -466,13 +466,13 @@ def test_digest_fasta_enzyme(tiny_fasta_file): max_peptide_len=50, max_mods=0, precursor_tolerance=20, - isotope_error=[0], + isotope_error=[0, 0], allowed_mods=( "M+15.995,N+0.984,Q+0.984," 
"+42.011,+43.006,-17.027,+43.006-17.027" ), ) - peptide_list = list(pdb.digest["peptide"]) + peptide_list = list(pdb.db_peptides["peptide"]) assert peptide_list == expected_aspn @@ -495,7 +495,7 @@ def test_get_candidates(tiny_fasta_file): max_peptide_len=50, max_mods=0, precursor_tolerance=10000, - isotope_error=[0], + isotope_error=[0, 0], allowed_mods=( "M+15.995,N+0.984,Q+0.984," "+42.011,+43.006,-17.027,+43.006-17.027" @@ -513,7 +513,7 @@ def test_get_candidates(tiny_fasta_file): max_peptide_len=50, max_mods=0, precursor_tolerance=150000, - isotope_error=[0], + isotope_error=[0, 0], allowed_mods=( "M+15.995,N+0.984,Q+0.984," "+42.011,+43.006,-17.027,+43.006-17.027" @@ -531,7 +531,7 @@ def test_get_candidates(tiny_fasta_file): max_peptide_len=50, max_mods=0, precursor_tolerance=600000, - isotope_error=[0], + isotope_error=[0, 0], allowed_mods=( "M+15.995,N+0.984,Q+0.984," "+42.011,+43.006,-17.027,+43.006-17.027" @@ -584,9 +584,8 @@ def test_get_candidates_isotope_error(tiny_fasta_file): peptide_list.sort_values("calc_mass", inplace=True) expected_isotope0 = list("UTSRQPONMLKJIHGFEDCB") - expected_isotope1 = list("VUTSRQPONMLKJIHGFEDC") - expected_isotope2 = list("WVUTSRQPONMLKJIHGFED") - expected_isotope3 = list("XWVUTSRQPONMLKJIHGFE") + expected_isotope01 = list("VUTSRQPONMLKJIHGFEDCB") + expected_isotope012 = list("WVUTSRQPONMLKJIHGFEDCB") expected_isotope0123 = list("XWVUTSRQPONMLKJIHGFEDCB") pdb = db_utils.ProteinDatabase( @@ -598,13 +597,13 @@ def test_get_candidates_isotope_error(tiny_fasta_file): max_peptide_len=0, max_mods=0, precursor_tolerance=10000, - isotope_error=[0], + isotope_error=[0, 0], allowed_mods=( "M+15.995,N+0.984,Q+0.984," "+42.011,+43.006,-17.027,+43.006-17.027" ), ) - pdb.digest = peptide_list + pdb.db_peptides = peptide_list candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_isotope0 == candidates @@ -617,15 +616,15 @@ def test_get_candidates_isotope_error(tiny_fasta_file): max_peptide_len=0, max_mods=0, precursor_tolerance=10000, - isotope_error=[1], + isotope_error=[0, 1], allowed_mods=( "M+15.995,N+0.984,Q+0.984," "+42.011,+43.006,-17.027,+43.006-17.027" ), ) - pdb.digest = peptide_list + pdb.db_peptides = peptide_list candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) - assert expected_isotope1 == candidates + assert expected_isotope01 == candidates pdb = db_utils.ProteinDatabase( fasta_path=str(tiny_fasta_file), @@ -636,15 +635,15 @@ def test_get_candidates_isotope_error(tiny_fasta_file): max_peptide_len=0, max_mods=0, precursor_tolerance=10000, - isotope_error=[2], + isotope_error=[0, 2], allowed_mods=( "M+15.995,N+0.984,Q+0.984," "+42.011,+43.006,-17.027,+43.006-17.027" ), ) - pdb.digest = peptide_list + pdb.db_peptides = peptide_list candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) - assert expected_isotope2 == candidates + assert expected_isotope012 == candidates pdb = db_utils.ProteinDatabase( fasta_path=str(tiny_fasta_file), @@ -655,32 +654,13 @@ def test_get_candidates_isotope_error(tiny_fasta_file): max_peptide_len=0, max_mods=0, precursor_tolerance=10000, - isotope_error=[3], + isotope_error=[0, 3], allowed_mods=( "M+15.995,N+0.984,Q+0.984," "+42.011,+43.006,-17.027,+43.006-17.027" ), ) - pdb.digest = peptide_list - candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) - assert expected_isotope3 == candidates - - pdb = db_utils.ProteinDatabase( - fasta_path=str(tiny_fasta_file), - enzyme="trypsin", - digestion="full", - missed_cleavages=0, - min_peptide_len=0, - 
max_peptide_len=0, - max_mods=0, - precursor_tolerance=10000, - isotope_error=[0, 1, 2, 3], - allowed_mods=( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" - ), - ) - pdb.digest = peptide_list + pdb.db_peptides = peptide_list candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_isotope0123 == candidates From 0dfdb2cb89514a0189e20cf19c231363567a7c72 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Mon, 2 Sep 2024 17:48:31 -0700 Subject: [PATCH 34/84] final adjustments added --- casanovo/config.yaml | 19 ++-- casanovo/data/db_utils.py | 158 +++++++++++++++++++------------- casanovo/denovo/dataloaders.py | 7 +- casanovo/denovo/model.py | 4 +- casanovo/denovo/model_runner.py | 17 ++-- tests/conftest.py | 43 ++++++++- tests/unit_tests/test_unit.py | 151 +++++++++++++++++++----------- 7 files changed, 254 insertions(+), 145 deletions(-) diff --git a/casanovo/config.yaml b/casanovo/config.yaml index 6c9063f5..af2f79d1 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -17,6 +17,8 @@ precursor_mass_tol: 50 # ppm isotope_error_range: [0, 1] # The minimum length of considered peptides. min_peptide_len: 6 +# The maximum length of considered peptides. +max_length: 100 # Number of spectra in one inference batch. predict_batch_size: 1024 @@ -47,19 +49,20 @@ devices: enzyme: "trypsin" # Digestion type for candidate peptide generation. # full: standard digestion. semi: Include products of semi-specific cleavage. +# Can also take a regex expression to specify custom digestion rules. digestion: "full" # Number of allowed missed cleavages when digesting protein. missed_cleavages: 0 # Maximum number of amino acid modifications per peptide, # None generates all possible isoforms as candidates. -max_mods: 0 -# Maximum peptide length to consider. -max_peptide_len: 50 +max_mods: 1 # Select which modifications from the vocabulary can be used in candidate creation. -# Permanent fixed mod (don't include): C+57.021 -# Allowed variable mods: M+15.995, N+0.984, Q+0.984 -# Allowed N-terminal mods: +42.011, +43.006, -17.027, +43.006-17.027 -allowed_mods: "M+15.995,N+0.984,Q+0.984,+42.011,+43.006,-17.027,+43.006-17.027" +# Format: Comma-separated list of "aa:mod_residue", +# where aa is a standard amino acid or "X" for an N-terminal mod +# and mod_residue is a key from the "residues" dictionary. +# Example: "M:M+15.995,X:+43.006-17.027" +allowed_fixed_mods: "C:C+57.021" +allowed_var_mods: "M:M+15.995,N:N+0.984,Q:Q+0.984,X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ### @@ -111,8 +114,6 @@ dropout: 0.0 # Number of dimensions to use for encoding peak intensity. # Projected up to `dim_model` by default and summed with the peak m/z encoding. dim_intensity: -# Max decoded peptide length. -max_length: 100 # The number of iterations for the linear warm-up of the learning rate. warmup_iters: 100_000 # The number of iterations for the cosine half period of the learning rate. 
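The isotope windows quoted in the test comments above follow directly from `precursor_mass_tol` and `isotope_error_range`: the observed m/z is turned into a neutral mass, shifted by whole isotope spacings, and widened by the ppm tolerance. A rough worked example for m/z 496.2 at charge 2 with a 10,000 ppm tolerance; the two constants are assumed values of the module-level PROTON and ISOTOPE_SPACING:

    PROTON = 1.00727646677
    ISOTOPE_SPACING = 1.003355

    mz, charge, tol_ppm = 496.2, 2, 10000
    neutral = charge * (mz - PROTON)            # ~990.385 Da

    for isotope in range(0, 2):                 # isotope_error_range: [0, 1]
        shifted = neutral - isotope * ISOTOPE_SPACING
        lo = shifted * (1 - tol_ppm / 1e6)
        hi = shifted * (1 + tol_ppm / 1e6)
        print(isotope, round(lo, 3), round(hi, 3))
    # isotope 0 gives roughly [980.48, 1000.29], matching the first window in the tests.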
diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index c1d5e91e..c9201538 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -1,13 +1,17 @@ """Unique methods used within db-search mode""" +import functools import logging import os +import re +import string from typing import List, Tuple import depthcharge.masses import pandas as pd +import pyteomics.fasta as fasta +import pyteomics.parser as parser from numba import njit -from pyteomics import fasta, parser logger = logging.getLogger("casanovo") @@ -41,8 +45,12 @@ class ProteinDatabase: The precursor mass tolerance in ppm. isotope_error : Tuple[int, int] Isotope range [min, max] to consider when comparing predicted and observed precursor m/z's. - allowed_mods : str - A comma separated string of allowed modifications to consider. + allowed_fixed_mods : str + A comma separated string of fixed modifications to consider. + allowed_var_mods : str + A comma separated string of variable modifications to consider. + residues : dict + A dictionary of amino acid masses. """ def __init__( @@ -56,9 +64,14 @@ def __init__( max_mods: int, precursor_tolerance: float, isotope_error: Tuple[int, int], - allowed_mods: str, + allowed_fixed_mods: str, + allowed_var_mods: str, + residues: dict, ): - self.fixed_mods, self.var_mods = _construct_mods_dict(allowed_mods) + self.residues = residues + self.fixed_mods, self.var_mods, self.swap_map = _construct_mods_dict( + allowed_fixed_mods, allowed_var_mods + ) self.db_peptides = self._digest_fasta( fasta_path, enzyme, @@ -88,20 +101,22 @@ def get_candidates( Returns ------- - candidates : List[Tuple[str, str]] - A list of candidate peptides and associated - protein. + candidates : pd.Series + A series of candidate peptides. """ candidates = [] for e in range(self.isotope_error[0], self.isotope_error[1] + 1): iso_shift = ISOTOPE_SPACING * e - upper_bound = float( - _to_raw_mass(precursor_mz, charge) - iso_shift - ) * (1 + (self.precursor_tolerance / 1e6)) - lower_bound = float( + shift_raw_mass = float( _to_raw_mass(precursor_mz, charge) - iso_shift - ) * (1 - (self.precursor_tolerance / 1e6)) + ) + upper_bound = shift_raw_mass * ( + 1 + (self.precursor_tolerance / 1e6) + ) + lower_bound = shift_raw_mass * ( + 1 - (self.precursor_tolerance / 1e6) + ) window = self.db_peptides[ (self.db_peptides["calc_mass"] >= lower_bound) @@ -112,7 +127,25 @@ def get_candidates( candidates = pd.concat(candidates) candidates.drop_duplicates(inplace=True) candidates.sort_values(by=["calc_mass", "peptide"], inplace=True) - return list(candidates["peptide"]), list(candidates["protein"]) + return candidates["peptide"], candidates["protein"] + + def get_associated_protein(self, peptide: str) -> str: + """ + Returns the associated protein for a given peptide. + + Parameters + ---------- + peptide : str + The peptide sequence. + + Returns + ------- + protein : str + The associated protein. + """ + return self.db_peptides[self.db_peptides["peptide"] == peptide][ + "protein" + ].values[0] def _digest_fasta( self, @@ -161,16 +194,18 @@ def _digest_fasta( logger.error("Digestion type %s not recognized.", digestion) raise ValueError(f"Digestion type {digestion} not recognized.") if enzyme not in parser.expasy_rules: - logger.error( - "Enzyme %s not recognized. Must be in pyteomics.parser.expasy_rules", + logger.info( + "Enzyme %s not recognized. 
Interpreting as cleavage rule.", enzyme, ) - raise ValueError(f"Enzyme {enzyme} not recognized.") semi = digestion == "partial" + valid_aa = set( + [re.sub(r"[^A-Z]+", "", res) for res in self.residues.keys()] + ) for header, seq in fasta.read(fasta_filename): pep_set = parser.cleave( seq, - rule=parser.expasy_rules[enzyme], + rule=enzyme, missed_cleavages=missed_cleavages, semi=semi, ) @@ -181,9 +216,8 @@ def _digest_fasta( or len(pep) > max_peptide_length ): continue - if any( - aa in pep for aa in "BJOUXZ" - ): # Check for incorrect AA letters + + if any(aa not in valid_aa for aa in pep): logger.warn( "Skipping peptide with unknown amino acids: %s", pep ) @@ -207,7 +241,10 @@ def _digest_fasta( mod_peptide_list = [ (mod_pep, mass_calculator.mass(mod_pep), prot) for isos, prot in peptide_isoforms - for mod_pep in map(_convert_from_modx, isos) + for mod_pep in map( + functools.partial(_convert_from_modx, swap_map=self.swap_map), + isos, + ) ] # Create a DataFrame for easy sorting and filtering pep_table = pd.DataFrame( @@ -261,31 +298,29 @@ def _to_raw_mass(mz_mass, charge): return charge * (mz_mass - PROTON) -def _convert_from_modx(seq: str): +def _convert_from_modx(seq: str, swap_map: dict) -> str: """Converts peptide sequence from modX format to Casanovo-acceptable modifications. Args: - seq (str): Peptide in modX format + seq : str + Peptide in modX format + swap_map : dict + Dictionary that allows for swapping of modX to Casanovo-acceptable modifications. """ - seq = seq.replace("carbmC", "C+57.021") # Fixed modification - seq = seq.replace("oxM", "M+15.995") - seq = seq.replace("dN", "N+0.984") - seq = seq.replace("dQ", "Q+0.984") - seq = seq.replace("ace-", "+42.011") - seq = seq.replace("carbnh3x-", "+43.006-17.027") - seq = seq.replace("carb-", "+43.006") - seq = seq.replace("nh3x-", "-17.027") - return seq - - -def _construct_mods_dict(allowed_mods): + regex = re.compile("(%s)" % "|".join(map(re.escape, swap_map.keys()))) + return regex.sub(lambda x: swap_map[x.group()], seq) + + +def _construct_mods_dict(allowed_fixed_mods, allowed_var_mods): """ Constructs dictionaries of fixed and variable modifications. Parameters ---------- - allowed_mods : str - A comma-separated list of allowed modifications. + allowed_fixed_mods : str + A comma separated string of fixed modifications to consider. + allowed_var_mods : str + A comma separated string of variable modifications to consider. Returns ------- @@ -293,35 +328,26 @@ def _construct_mods_dict(allowed_mods): A dictionary of fixed modifications. var_mods : dict A dictionary of variable modifications. + swap_map : dict + A dictionary that allows for swapping of modX to Casanovo-acceptable modifications. 
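With the change above, an enzyme string that is not in `pyteomics.parser.expasy_rules` is no longer an error; it is handed to `parser.cleave` as the cleavage rule, so both named enzymes and custom regex rules work. A small sketch, using two tryptic peptides from the tests concatenated into one toy sequence:

    from pyteomics import parser

    seq = "MEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGER"

    # Named enzyme, looked up explicitly here via the expasy table.
    tryptic = parser.cleave(seq, parser.expasy_rules["trypsin"], missed_cleavages=0)

    # Custom rule: the string is treated as a regex of cleavage sites,
    # e.g. "R" cuts after every arginine (arg-c-like behaviour).
    argc_like = parser.cleave(seq, "R", missed_cleavages=0)

    print(sorted(tryptic), sorted(argc_like))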
""" - fixed_mods = {"carbm": ["C"]} - var_mods = {} + swap_map = {} + fixed_mods = {} + for idx, mod in enumerate(allowed_fixed_mods.split(",")): + aa, mod_aa = mod.split(":") + mod_id = string.ascii_lowercase[idx] + fixed_mods[mod_id] = [aa] + swap_map[f"{mod_id}{aa}"] = f"{mod_aa}" - if not allowed_mods: - return fixed_mods, var_mods - for mod in allowed_mods.split(","): - if mod == "M+15.995": - if "ox" not in var_mods: - var_mods["ox"] = [] - var_mods["ox"].append("M") - elif mod == "N+0.984": - if "d" not in var_mods: - var_mods["d"] = [] - var_mods["d"].append("N") - elif mod == "Q+0.984": - if "d" not in var_mods: - var_mods["d"] = [] - var_mods["d"].append("Q") - elif mod == "+42.011": - var_mods["ace-"] = True - elif mod == "+43.006": - var_mods["carb-"] = True - elif mod == "-17.027": - var_mods["nh3x-"] = True - elif mod == "+43.006-17.027": - var_mods["carbnh3x-"] = True + var_mods = {} + for idx, mod in enumerate(allowed_var_mods.split(",")): + aa, mod_aa = mod.split(":") + mod_id = string.ascii_lowercase[idx] + if aa == "X": + var_mods[f"{mod_id}-"] = True + swap_map[f"{mod_id}-"] = f"{mod_aa}" else: - logger.error("Modification %s not recognized.", mod) - raise ValueError(f"Modification {mod} not recognized.") + var_mods[mod_id] = [aa] + swap_map[f"{mod_id}{aa}"] = f"{mod_aa}" - return fixed_mods, var_mods + return fixed_mods, var_mods, swap_map diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index a6ab8ddc..6e8c93b3 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -267,7 +267,7 @@ def prepare_psm_batch( all_peptides : List[str] The candidate peptides for each spectrum. all_proteins : List[str] - The associated proteins for each candidate peptide. + The proteins associated with each candidate peptide. """ spectra, precursor_mzs, precursor_charges, spectrum_ids = list(zip(*batch)) spectra = torch.nn.utils.rnn.pad_sequence(spectra, batch_first=True) @@ -285,12 +285,11 @@ def prepare_psm_batch( all_peptides = [] all_proteins = [] for idx in range(len(batch)): - digest_data = protein_database.get_candidates( + spec_peptides, spec_proteins = protein_database.get_candidates( precursor_mzs[idx].type(torch.float64).item(), precursor_charges[idx].type(torch.int64).item(), ) try: - spec_peptides, pep_protein = digest_data all_spectra.append( spectra[idx].unsqueeze(0).repeat(len(spec_peptides), 1, 1) ) @@ -299,7 +298,7 @@ def prepare_psm_batch( ) all_spectrum_ids.extend([spectrum_ids[idx]] * len(spec_peptides)) all_peptides.extend(spec_peptides) - all_proteins.extend(pep_protein) + all_proteins.extend(spec_proteins) except ValueError: logger.warning( "No candidates found for spectrum %s", spectrum_ids[idx] diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index b38a27c0..dc7e5f7b 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1017,9 +1017,9 @@ def predict_step(self, batch, *args): Parameters ---------- - batch : Tuple[torch.Tensor, torch.Tensor, np.array, List[str], List[str]] + batch : Tuple[torch.Tensor, torch.Tensor, np.array, List[str]] A batch of (i) MS/MS spectra, (ii) precursor information, (iii) - spectrum identifiers, (iv) candidate peptides, (v) associated proteins. + spectrum identifiers, (iv) candidate peptides, (v) associated protein. 
Returns ------- diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 6928560d..395320e5 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -110,22 +110,25 @@ def db_search( self.initialize_model(train=False, db_search=True) self.model.out_writer = self.writer self.model.psm_batch_size = self.config.predict_batch_size - test_index = self._get_index(peak_path, False, "db search") - self.writer.set_ms_run(test_index.ms_files) - - self.initialize_data_module(test_index=test_index) - self.loaders.protein_database = db_utils.ProteinDatabase( + self.model.protein_database = db_utils.ProteinDatabase( fasta_path, self.config.enzyme, self.config.digestion, self.config.missed_cleavages, self.config.min_peptide_len, - self.config.max_peptide_len, + self.config.max_length, self.config.max_mods, self.config.precursor_mass_tol, self.config.isotope_error_range, - self.config.allowed_mods, + self.config.allowed_fixed_mods, + self.config.allowed_var_mods, + self.config.residues, ) + test_index = self._get_index(peak_path, False, "db search") + self.writer.set_ms_run(test_index.ms_files) + + self.initialize_data_module(test_index=test_index) + self.loaders.protein_database = self.model.protein_database self.loaders.setup(stage="test", annotated=False) self.trainer.predict(self.model, self.loaders.db_dataloader()) diff --git a/tests/conftest.py b/tests/conftest.py index 90e522fe..3b94896a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -241,7 +241,7 @@ def tiny_config(tmp_path): "precursor_mass_tol": 5, "isotope_error_range": [0, 1], "min_peptide_len": 6, - "max_peptide_len": 50, + "max_length": 100, "enzyme": "trypsin", "digestion": "full", "missed_cleavages": 0, @@ -263,7 +263,6 @@ def tiny_config(tmp_path): "dim_model": 512, "dropout": 0.0, "dim_intensity": None, - "max_length": 100, "learning_rate": 5e-4, "weight_decay": 1e-5, "train_batch_size": 32, @@ -298,9 +297,10 @@ def tiny_config(tmp_path): "-17.027": -17.026549, "+43.006-17.027": 25.980265, }, - "allowed_mods": ( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" + "allowed_fixed_mods": "C:C+57.021", + "allowed_var_mods": ( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ), } @@ -311,6 +311,39 @@ def tiny_config(tmp_path): return cfg_file +@pytest.fixture +def residues_dict(): + return { + "G": 57.021464, + "A": 71.037114, + "S": 87.032028, + "P": 97.052764, + "V": 99.068414, + "T": 101.047670, + "C+57.021": 160.030649, + "L": 113.084064, + "I": 113.084064, + "N": 114.042927, + "D": 115.026943, + "Q": 128.058578, + "K": 128.094963, + "E": 129.042593, + "M": 131.040485, + "H": 137.058912, + "F": 147.068414, + "R": 156.101111, + "Y": 163.063329, + "W": 186.079313, + "M+15.995": 147.035400, + "N+0.984": 115.026943, + "Q+0.984": 129.042594, + "+42.011": 42.010565, + "+43.006": 43.005814, + "-17.027": -17.026549, + "+43.006-17.027": 25.980265, + } + + @pytest.fixture def tide_dir_small(tmp_path): """A directory with a very small TIDE search result.""" diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 51d9a3c9..c06ec788 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -221,7 +221,7 @@ def test_calc_match_score(): assert np.sum(masked_per_aa_scores.numpy()[3]) == 3 -def test_digest_fasta_cleave(tiny_fasta_file): +def test_digest_fasta_cleave(tiny_fasta_file, residues_dict): # No missed cleavages expected_normal = [ @@ -287,16 +287,18 @@ def 
test_digest_fasta_cleave(tiny_fasta_file): max_mods=0, precursor_tolerance=20, isotope_error=[0, 0], - allowed_mods=( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ), + residues=residues_dict, ) peptide_list = list(pdb.db_peptides["peptide"]) assert peptide_list == expected -def test_digest_fasta_mods(tiny_fasta_file): +def test_digest_fasta_mods(tiny_fasta_file, residues_dict): # 1 modification allowed # fixed: C+57.02146 # variable: 1M+15.994915,1N+0.984016,1Q+0.984016 @@ -360,10 +362,12 @@ def test_digest_fasta_mods(tiny_fasta_file): max_mods=1, precursor_tolerance=20, isotope_error=[0, 0], - allowed_mods=( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ), + residues=residues_dict, ) peptide_list = list(pdb.db_peptides["peptide"]) peptide_list = [ @@ -376,7 +380,7 @@ def test_digest_fasta_mods(tiny_fasta_file): assert peptide_list == expected_1mod -def test_length_restrictions(tiny_fasta_file): +def test_length_restrictions(tiny_fasta_file, residues_dict): # length between 20 and 50 expected_long = [ "MEAPAQLLFLLLLWLPDTTR", @@ -397,10 +401,12 @@ def test_length_restrictions(tiny_fasta_file): max_mods=0, precursor_tolerance=20, isotope_error=[0, 0], - allowed_mods=( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ), + residues=residues_dict, ) peptide_list = list(pdb.db_peptides["peptide"]) assert peptide_list == expected_long @@ -415,16 +421,18 @@ def test_length_restrictions(tiny_fasta_file): max_mods=0, precursor_tolerance=20, isotope_error=[0, 0], - allowed_mods=( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ), + residues=residues_dict, ) peptide_list = list(pdb.db_peptides["peptide"]) assert peptide_list == expected_short -def test_digest_fasta_enzyme(tiny_fasta_file): +def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): # arg-c enzyme expected_argc = [ "ATSIPAR", @@ -449,10 +457,12 @@ def test_digest_fasta_enzyme(tiny_fasta_file): max_mods=0, precursor_tolerance=20, isotope_error=[0, 0], - allowed_mods=( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ), + residues=residues_dict, ) peptide_list = list(pdb.db_peptides["peptide"]) assert peptide_list == expected_argc @@ -467,16 +477,39 @@ def test_digest_fasta_enzyme(tiny_fasta_file): max_mods=0, precursor_tolerance=20, isotope_error=[0, 0], - allowed_mods=( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ), + residues=residues_dict, ) peptide_list = list(pdb.db_peptides["peptide"]) assert peptide_list == expected_aspn + # Tesr regex rule instead of named enzyme + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), 
+ enzyme="R", + digestion="full", + missed_cleavages=0, + min_peptide_len=6, + max_peptide_len=50, + max_mods=0, + precursor_tolerance=20, + isotope_error=[0, 0], + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + ), + residues=residues_dict, + ) + peptide_list = list(pdb.db_peptides["peptide"]) + assert peptide_list == expected_argc + -def test_get_candidates(tiny_fasta_file): +def test_get_candidates(tiny_fasta_file, residues_dict): # precursor_window is 10000 expected_smallwindow = ["LLIYGASTR"] @@ -496,13 +529,15 @@ def test_get_candidates(tiny_fasta_file): max_mods=0, precursor_tolerance=10000, isotope_error=[0, 0], - allowed_mods=( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ), + residues=residues_dict, ) candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) - assert expected_smallwindow == candidates + assert expected_smallwindow == list(candidates) pdb = db_utils.ProteinDatabase( fasta_path=str(tiny_fasta_file), @@ -514,13 +549,15 @@ def test_get_candidates(tiny_fasta_file): max_mods=0, precursor_tolerance=150000, isotope_error=[0, 0], - allowed_mods=( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ), + residues=residues_dict, ) candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) - assert expected_midwindow == candidates + assert expected_midwindow == list(candidates) pdb = db_utils.ProteinDatabase( fasta_path=str(tiny_fasta_file), @@ -532,16 +569,18 @@ def test_get_candidates(tiny_fasta_file): max_mods=0, precursor_tolerance=600000, isotope_error=[0, 0], - allowed_mods=( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ), + residues=residues_dict, ) candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) - assert expected_widewindow == candidates + assert expected_widewindow == list(candidates) -def test_get_candidates_isotope_error(tiny_fasta_file): +def test_get_candidates_isotope_error(tiny_fasta_file, residues_dict): # Tide isotope error windows for 496.2, 2+: # 0: [980.481617, 1000.289326] @@ -598,14 +637,16 @@ def test_get_candidates_isotope_error(tiny_fasta_file): max_mods=0, precursor_tolerance=10000, isotope_error=[0, 0], - allowed_mods=( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ), + residues=residues_dict, ) pdb.db_peptides = peptide_list candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) - assert expected_isotope0 == candidates + assert expected_isotope0 == list(candidates) pdb = db_utils.ProteinDatabase( fasta_path=str(tiny_fasta_file), @@ -617,14 +658,16 @@ def test_get_candidates_isotope_error(tiny_fasta_file): max_mods=0, precursor_tolerance=10000, isotope_error=[0, 1], - allowed_mods=( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + 
"X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ), + residues=residues_dict, ) pdb.db_peptides = peptide_list candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) - assert expected_isotope01 == candidates + assert expected_isotope01 == list(candidates) pdb = db_utils.ProteinDatabase( fasta_path=str(tiny_fasta_file), @@ -636,14 +679,16 @@ def test_get_candidates_isotope_error(tiny_fasta_file): max_mods=0, precursor_tolerance=10000, isotope_error=[0, 2], - allowed_mods=( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ), + residues=residues_dict, ) pdb.db_peptides = peptide_list candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) - assert expected_isotope012 == candidates + assert expected_isotope012 == list(candidates) pdb = db_utils.ProteinDatabase( fasta_path=str(tiny_fasta_file), @@ -655,14 +700,16 @@ def test_get_candidates_isotope_error(tiny_fasta_file): max_mods=0, precursor_tolerance=10000, isotope_error=[0, 3], - allowed_mods=( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ), + residues=residues_dict, ) pdb.db_peptides = peptide_list candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) - assert expected_isotope0123 == candidates + assert expected_isotope0123 == list(candidates) def test_beam_search_decode(): From 4a5b238133aaa1db27f584f52d9328b2f90c35f4 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Tue, 3 Sep 2024 10:29:23 -0700 Subject: [PATCH 35/84] minor changes regarding formatting and small efficiency boosts --- casanovo/config.yaml | 8 +++--- casanovo/data/db_utils.py | 52 ++++++++++++++++++++-------------- casanovo/denovo/dataloaders.py | 13 ++++----- casanovo/denovo/model.py | 2 +- 4 files changed, 42 insertions(+), 33 deletions(-) diff --git a/casanovo/config.yaml b/casanovo/config.yaml index af2f79d1..17cba6a4 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -46,23 +46,23 @@ devices: # Enzyme for in silico digestion, used to generate candidate peptides. # See pyteomics.parser.expasy_rules for valid enzymes. +# Can also take a regex expression to specify custom digestion rules. enzyme: "trypsin" # Digestion type for candidate peptide generation. # full: standard digestion. semi: Include products of semi-specific cleavage. -# Can also take a regex expression to specify custom digestion rules. digestion: "full" # Number of allowed missed cleavages when digesting protein. missed_cleavages: 0 -# Maximum number of amino acid modifications per peptide, +# Maximum number of variable amino acid modifications per peptide, # None generates all possible isoforms as candidates. max_mods: 1 # Select which modifications from the vocabulary can be used in candidate creation. # Format: Comma-separated list of "aa:mod_residue", -# where aa is a standard amino acid or "X" for an N-terminal mod +# where aa is a standard amino acid or "nterm" for an N-terminal mod # and mod_residue is a key from the "residues" dictionary. 
# Example: "M:M+15.995,X:+43.006-17.027" allowed_fixed_mods: "C:C+57.021" -allowed_var_mods: "M:M+15.995,N:N+0.984,Q:Q+0.984,X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" +allowed_var_mods: "M:M+15.995,N:N+0.984,Q:Q+0.984,nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ### diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index c9201538..86c2112d 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -5,6 +5,7 @@ import os import re import string +from collections import defaultdict from typing import List, Tuple import depthcharge.masses @@ -13,6 +14,7 @@ import pyteomics.parser as parser from numba import njit + logger = logging.getLogger("casanovo") # CONSTANTS @@ -72,6 +74,9 @@ def __init__( self.fixed_mods, self.var_mods, self.swap_map = _construct_mods_dict( allowed_fixed_mods, allowed_var_mods ) + self.swap_regex = re.compile( + "(%s)" % "|".join(map(re.escape, self.swap_map.keys())) + ) self.db_peptides = self._digest_fasta( fasta_path, enzyme, @@ -167,6 +172,7 @@ def _digest_fasta( enzyme : str The enzyme to use for digestion. See pyteomics.parser.expasy_rules for valid enzymes. + Can also be a regex pattern. digestion : str The type of digestion to perform. Either 'full' or 'partial'. missed_cleavages : int @@ -199,9 +205,7 @@ def _digest_fasta( enzyme, ) semi = digestion == "partial" - valid_aa = set( - [re.sub(r"[^A-Z]+", "", res) for res in self.residues.keys()] - ) + valid_aa = set(list(self.residues.keys()) + ["C"]) for header, seq in fasta.read(fasta_filename): pep_set = parser.cleave( seq, @@ -212,17 +216,16 @@ def _digest_fasta( protein = header.split()[0] for pep in pep_set: if ( - len(pep) < min_peptide_length - or len(pep) > max_peptide_length + len(pep) >= min_peptide_length + or len(pep) <= max_peptide_length ): - continue - - if any(aa not in valid_aa for aa in pep): - logger.warn( - "Skipping peptide with unknown amino acids: %s", pep - ) - continue - peptide_list.append((pep, protein)) + if any(aa not in valid_aa for aa in pep): + logger.warn( + "Skipping peptide with unknown amino acids: %s", + pep, + ) + else: + peptide_list.append((pep, protein)) # Generate modified peptides mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") @@ -242,7 +245,11 @@ def _digest_fasta( (mod_pep, mass_calculator.mass(mod_pep), prot) for isos, prot in peptide_isoforms for mod_pep in map( - functools.partial(_convert_from_modx, swap_map=self.swap_map), + functools.partial( + _convert_from_modx, + swap_map=self.swap_map, + swap_regex=self.swap_regex, + ), isos, ) ] @@ -259,7 +266,7 @@ def _digest_fasta( @njit -def _to_mz(precursor_mass, charge): +def _to_mz(precursor_mass: float, charge: int) -> float: """ Convert precursor neutral mass to m/z value. @@ -279,7 +286,7 @@ def _to_mz(precursor_mass, charge): @njit -def _to_raw_mass(mz_mass, charge): +def _to_raw_mass(mz_mass: float, charge: int) -> float: """ Convert precursor m/z value to neutral mass. @@ -298,7 +305,7 @@ def _to_raw_mass(mz_mass, charge): return charge * (mz_mass - PROTON) -def _convert_from_modx(seq: str, swap_map: dict) -> str: +def _convert_from_modx(seq: str, swap_map: dict, swap_regex: str) -> str: """Converts peptide sequence from modX format to Casanovo-acceptable modifications. Args: @@ -306,12 +313,15 @@ def _convert_from_modx(seq: str, swap_map: dict) -> str: Peptide in modX format swap_map : dict Dictionary that allows for swapping of modX to Casanovo-acceptable modifications. + swap_regex : str + Regular expression to match modX format. 
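A compact illustration of the substitution this function performs: every modX label in `swap_map` is escaped into a single alternation pattern, and one `re.sub` pass rewrites all labels at once. The swap map below is hand-written for the sketch; the real one comes from `_construct_mods_dict`.

    import re

    swap_map = {"aC": "C+57.021", "bM": "M+15.995", "c-": "+42.011"}
    swap_regex = re.compile("(%s)" % "|".join(map(re.escape, swap_map.keys())))

    modx = "c-bMTaCR"   # N-terminal +42.011, oxidised M, carbamidomethylated C
    converted = swap_regex.sub(lambda m: swap_map[m.group()], modx)
    assert converted == "+42.011M+15.995TC+57.021R"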
""" - regex = re.compile("(%s)" % "|".join(map(re.escape, swap_map.keys()))) - return regex.sub(lambda x: swap_map[x.group()], seq) + return swap_regex.sub(lambda x: swap_map[x.group()], seq) -def _construct_mods_dict(allowed_fixed_mods, allowed_var_mods): +def _construct_mods_dict( + allowed_fixed_mods: str, allowed_var_mods: str +) -> Tuple[dict, dict, dict]: """ Constructs dictionaries of fixed and variable modifications. @@ -343,7 +353,7 @@ def _construct_mods_dict(allowed_fixed_mods, allowed_var_mods): for idx, mod in enumerate(allowed_var_mods.split(",")): aa, mod_aa = mod.split(":") mod_id = string.ascii_lowercase[idx] - if aa == "X": + if aa == "nterm": var_mods[f"{mod_id}-"] = True swap_map[f"{mod_id}-"] = f"{mod_aa}" else: diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 6e8c93b3..4793e2f3 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -272,11 +272,11 @@ def prepare_psm_batch( spectra, precursor_mzs, precursor_charges, spectrum_ids = list(zip(*batch)) spectra = torch.nn.utils.rnn.pad_sequence(spectra, batch_first=True) - precursor_mzs = torch.tensor(precursor_mzs) - precursor_charges = torch.tensor(precursor_charges) - precursor_masses = (precursor_mzs - 1.007276) * precursor_charges + precursor_mzs_t = torch.tensor(precursor_mzs) + precursor_charges_t = torch.tensor(precursor_charges) + precursor_masses_t = (precursor_mzs_t - 1.007276) * precursor_charges_t precursors = torch.vstack( - [precursor_masses, precursor_charges, precursor_mzs] + [precursor_masses_t, precursor_charges_t, precursor_mzs_t] ).T.float() all_spectra = [] @@ -286,8 +286,8 @@ def prepare_psm_batch( all_proteins = [] for idx in range(len(batch)): spec_peptides, spec_proteins = protein_database.get_candidates( - precursor_mzs[idx].type(torch.float64).item(), - precursor_charges[idx].type(torch.int64).item(), + precursor_mzs[idx], + precursor_charges[idx], ) try: all_spectra.append( @@ -303,7 +303,6 @@ def prepare_psm_batch( logger.warning( "No candidates found for spectrum %s", spectrum_ids[idx] ) - continue return ( torch.cat(all_spectra, dim=0), diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index dc7e5f7b..31757d81 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1009,7 +1009,7 @@ class DbSpec2Pep(Spec2Pep): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.psm_batch_size = 1024 + self.psm_batch_size = None def predict_step(self, batch, *args): """ From 4352bbdfb41aeeb61675c9a290f7bc83eae2f717 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Tue, 3 Sep 2024 11:24:18 -0700 Subject: [PATCH 36/84] changes before reformatting config --- casanovo/data/db_utils.py | 21 +++++++++++++-------- tests/conftest.py | 2 +- tests/unit_tests/test_unit.py | 28 ++++++++++++++-------------- 3 files changed, 28 insertions(+), 23 deletions(-) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index 86c2112d..26f7152c 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -77,7 +77,7 @@ def __init__( self.swap_regex = re.compile( "(%s)" % "|".join(map(re.escape, self.swap_map.keys())) ) - self.db_peptides = self._digest_fasta( + self.db_peptides, self.prot_map = self._digest_fasta( fasta_path, enzyme, digestion, @@ -146,11 +146,9 @@ def get_associated_protein(self, peptide: str) -> str: Returns ------- protein : str - The associated protein. + The associated protein(s). 
""" - return self.db_peptides[self.db_peptides["peptide"] == peptide][ - "protein" - ].values[0] + return ",".join(self.prot_map[peptide]) def _digest_fasta( self, @@ -186,9 +184,11 @@ def _digest_fasta( Returns ------- - mod_peptide_list : pd.DataFrame + pep_table : pd.DataFrame A Pandas DataFrame with peptide, mass, and protein columns. Sorted by neutral mass in ascending order. + prot_map : dict + A dictionary mapping peptides to associated proteins. """ # Verify the existence of the file: if not os.path.isfile(fasta_filename): @@ -217,7 +217,7 @@ def _digest_fasta( for pep in pep_set: if ( len(pep) >= min_peptide_length - or len(pep) <= max_peptide_length + and len(pep) <= max_peptide_length ): if any(aa not in valid_aa for aa in pep): logger.warn( @@ -259,10 +259,15 @@ def _digest_fasta( ) pep_table.sort_values(by=["calc_mass", "peptide"], inplace=True) + # Create a dictionary mapping for easy accession of associated proteins + prot_map = defaultdict(list) + for pep, _, prot in mod_peptide_list: + prot_map[pep].append(prot) + logger.info( "Digestion complete. %d peptides generated.", len(pep_table) ) - return pep_table + return pep_table, prot_map @njit diff --git a/tests/conftest.py b/tests/conftest.py index 3b94896a..bf02a3ab 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -300,7 +300,7 @@ def tiny_config(tmp_path): "allowed_fixed_mods": "C:C+57.021", "allowed_var_mods": ( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), } diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index c06ec788..d03d6f7f 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -290,7 +290,7 @@ def test_digest_fasta_cleave(tiny_fasta_file, residues_dict): allowed_fixed_mods="C:C+57.021", allowed_var_mods=( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), residues=residues_dict, ) @@ -365,7 +365,7 @@ def test_digest_fasta_mods(tiny_fasta_file, residues_dict): allowed_fixed_mods="C:C+57.021", allowed_var_mods=( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), residues=residues_dict, ) @@ -404,7 +404,7 @@ def test_length_restrictions(tiny_fasta_file, residues_dict): allowed_fixed_mods="C:C+57.021", allowed_var_mods=( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), residues=residues_dict, ) @@ -424,7 +424,7 @@ def test_length_restrictions(tiny_fasta_file, residues_dict): allowed_fixed_mods="C:C+57.021", allowed_var_mods=( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), residues=residues_dict, ) @@ -460,7 +460,7 @@ def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): allowed_fixed_mods="C:C+57.021", allowed_var_mods=( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), residues=residues_dict, ) @@ -480,7 +480,7 @@ def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): allowed_fixed_mods="C:C+57.021", allowed_var_mods=( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + 
"nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), residues=residues_dict, ) @@ -501,7 +501,7 @@ def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): allowed_fixed_mods="C:C+57.021", allowed_var_mods=( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), residues=residues_dict, ) @@ -532,7 +532,7 @@ def test_get_candidates(tiny_fasta_file, residues_dict): allowed_fixed_mods="C:C+57.021", allowed_var_mods=( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), residues=residues_dict, ) @@ -552,7 +552,7 @@ def test_get_candidates(tiny_fasta_file, residues_dict): allowed_fixed_mods="C:C+57.021", allowed_var_mods=( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), residues=residues_dict, ) @@ -572,7 +572,7 @@ def test_get_candidates(tiny_fasta_file, residues_dict): allowed_fixed_mods="C:C+57.021", allowed_var_mods=( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), residues=residues_dict, ) @@ -640,7 +640,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file, residues_dict): allowed_fixed_mods="C:C+57.021", allowed_var_mods=( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), residues=residues_dict, ) @@ -661,7 +661,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file, residues_dict): allowed_fixed_mods="C:C+57.021", allowed_var_mods=( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), residues=residues_dict, ) @@ -682,7 +682,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file, residues_dict): allowed_fixed_mods="C:C+57.021", allowed_var_mods=( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), residues=residues_dict, ) @@ -703,7 +703,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file, residues_dict): allowed_fixed_mods="C:C+57.021", allowed_var_mods=( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), residues=residues_dict, ) From ddff67fb03b06d3b27f73ff58dfdd478cd8a826b Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Tue, 3 Sep 2024 12:00:28 -0700 Subject: [PATCH 37/84] replace all occurences of "max_length" with "max_peptide_len" --- casanovo/config.py | 2 +- casanovo/config.yaml | 2 +- casanovo/denovo/model.py | 26 +++++++++++++------------- casanovo/denovo/model_runner.py | 6 +++--- tests/conftest.py | 2 +- tests/unit_tests/test_unit.py | 10 +++++----- 6 files changed, 24 insertions(+), 24 deletions(-) diff --git a/casanovo/config.py b/casanovo/config.py index 792da35a..8577d087 100644 --- a/casanovo/config.py +++ b/casanovo/config.py @@ -59,7 +59,7 @@ class Config: n_layers=int, dropout=float, dim_intensity=int, - max_length=int, + max_peptide_len=int, residues=dict, n_log=int, tb_summarywriter=str, diff --git a/casanovo/config.yaml b/casanovo/config.yaml index 17cba6a4..e8732b20 100644 --- a/casanovo/config.yaml +++ 
b/casanovo/config.yaml @@ -18,7 +18,7 @@ isotope_error_range: [0, 1] # The minimum length of considered peptides. min_peptide_len: 6 # The maximum length of considered peptides. -max_length: 100 +max_peptide_len: 100 # Number of spectra in one inference batch. predict_batch_size: 1024 diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 31757d81..6fe34bfa 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -46,7 +46,7 @@ class Spec2Pep(pl.LightningModule, ModelMixin): (``dim_model - dim_intensity``) are reserved for encoding the m/z value. If ``None``, the intensity will be projected up to ``dim_model`` using a linear layer, then summed with the m/z encoding for each peak. - max_length : int + max_peptide_len : int The maximum peptide length to decode. residues : Union[Dict[str, float], str] The amino acid dictionary and their masses. By default ("canonical) this @@ -99,7 +99,7 @@ def __init__( n_layers: int = 9, dropout: float = 0.0, dim_intensity: Optional[int] = None, - max_length: int = 100, + max_peptide_len: int = 100, residues: Union[Dict[str, float], str] = "canonical", max_charge: int = 5, precursor_mass_tol: float = 50, @@ -158,7 +158,7 @@ def __init__( self.opt_kwargs = kwargs # Data properties. - self.max_length = max_length + self.max_peptide_len = max_peptide_len self.residues = residues self.precursor_mass_tol = precursor_mass_tol self.isotope_error_range = isotope_error_range @@ -241,7 +241,7 @@ def beam_search_decode( # Sizes. batch = spectra.shape[0] # B - length = self.max_length + 1 # L + length = self.max_peptide_len + 1 # L vocab = self.decoder.vocab_size + 1 # V beam = self.n_beams # S @@ -269,7 +269,7 @@ def beam_search_decode( scores = einops.rearrange(scores, "B L V S -> (B S) L V") # The main decoding loop. - for step in range(0, self.max_length): + for step in range(0, self.max_peptide_len): # Terminate beams exceeding the precursor m/z tolerance and track # all finished beams (either terminated or stop token predicted). ( @@ -323,10 +323,10 @@ def _finish_beams( Parameters ---------- - tokens : torch.Tensor of shape (n_spectra * n_beams, max_length) + tokens : torch.Tensor of shape (n_spectra * n_beams, max_peptide_len) Predicted amino acid tokens for all beams and all spectra. scores : torch.Tensor of shape - (n_spectra * n_beams, max_length, n_amino_acids) + (n_spectra * n_beams, max_peptide_len, n_amino_acids) Scores for the predicted amino acid tokens for all beams and all spectra. step : int @@ -491,10 +491,10 @@ def _cache_finished_beams( Parameters ---------- - tokens : torch.Tensor of shape (n_spectra * n_beams, max_length) + tokens : torch.Tensor of shape (n_spectra * n_beams, max_peptide_len) Predicted amino acid tokens for all beams and all spectra. scores : torch.Tensor of shape - (n_spectra * n_beams, max_length, n_amino_acids) + (n_spectra * n_beams, max_peptide_len, n_amino_acids) Scores for the predicted amino acid tokens for all beams and all spectra. step : int @@ -576,10 +576,10 @@ def _get_topk_beams( Parameters ---------- - tokens : torch.Tensor of shape (n_spectra * n_beams, max_length) + tokens : torch.Tensor of shape (n_spectra * n_beams, max_peptide_len) Predicted amino acid tokens for all beams and all spectra. scores : torch.Tensor of shape - (n_spectra * n_beams, max_length, n_amino_acids) + (n_spectra * n_beams, max_peptide_len, n_amino_acids) Scores for the predicted amino acid tokens for all beams and all spectra. 
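The `(n_spectra * n_beams, ...)` shapes in these docstrings come from folding the beam axis into the batch axis, as in the `einops.rearrange` call shown above. A quick shape check with dummy sizes:

    import torch
    import einops

    B, L, V, S = 2, 5, 7, 3   # dummy batch, peptide length, vocabulary and beam sizes
    scores = torch.zeros(B, L, V, S)

    flat = einops.rearrange(scores, "B L V S -> (B S) L V")
    assert flat.shape == (B * S, L, V)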
finished_beams : torch.Tensor of shape (n_spectra * n_beams) @@ -592,10 +592,10 @@ def _get_topk_beams( Returns ------- - tokens : torch.Tensor of shape (n_spectra * n_beams, max_length) + tokens : torch.Tensor of shape (n_spectra * n_beams, max_peptide_len) Predicted amino acid tokens for all beams and all spectra. scores : torch.Tensor of shape - (n_spectra * n_beams, max_length, n_amino_acids) + (n_spectra * n_beams, max_peptide_len, n_amino_acids) Scores for the predicted amino acid tokens for all beams and all spectra. """ diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 395320e5..efb380cb 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -116,7 +116,7 @@ def db_search( self.config.digestion, self.config.missed_cleavages, self.config.min_peptide_len, - self.config.max_length, + self.config.max_peptide_len, self.config.max_mods, self.config.precursor_mass_tol, self.config.isotope_error_range, @@ -271,7 +271,7 @@ def initialize_model( n_layers=self.config.n_layers, dropout=self.config.dropout, dim_intensity=self.config.dim_intensity, - max_length=self.config.max_length, + max_peptide_len=self.config.max_peptide_len, residues=self.config.residues, max_charge=self.config.max_charge, precursor_mass_tol=self.config.precursor_mass_tol, @@ -292,7 +292,7 @@ def initialize_model( # Reconfigurable non-architecture related parameters for a loaded model. loaded_model_params = dict( - max_length=self.config.max_length, + max_peptide_len=self.config.max_peptide_len, precursor_mass_tol=self.config.precursor_mass_tol, isotope_error_range=self.config.isotope_error_range, n_beams=self.config.n_beams, diff --git a/tests/conftest.py b/tests/conftest.py index bf02a3ab..95ef2d02 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -241,7 +241,7 @@ def tiny_config(tmp_path): "precursor_mass_tol": 5, "isotope_error_range": [0, 1], "min_peptide_len": 6, - "max_length": 100, + "max_peptide_len": 100, "enzyme": "trypsin", "digestion": "full", "missed_cleavages": 0, diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index d03d6f7f..63d492f8 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -722,7 +722,7 @@ def test_beam_search_decode(): # Sizes. batch = 1 # B - length = model.max_length + 1 # L + length = model.max_peptide_len + 1 # L vocab = model.decoder.vocab_size + 1 # V beam = model.n_beams # S step = 3 @@ -839,12 +839,12 @@ def test_beam_search_decode(): assert torch.equal(new_scores[:, step, :], expected_scores) # Test output if decoding loop isn't stopped with termination of all beams. - model.max_length = 0 + model.max_peptide_len = 0 # 1 spectrum with 5 peaks (2 values: m/z and intensity). spectra = torch.zeros(1, 5, 2) precursors = torch.tensor([[469.25364, 2.0, 235.63410]]) assert len(list(model.beam_search_decode(spectra, precursors))[0]) == 0 - model.max_length = 100 + model.max_peptide_len = 100 # Re-initialize scores and tokens to further test caching functionality. 
scores = torch.full( @@ -1004,7 +1004,7 @@ def test_beam_search_decode(): batch = 2 # B beam = model.n_beams # S model.decoder.reverse = True - length = model.max_length + 1 # L + length = model.max_peptide_len + 1 # L vocab = model.decoder.vocab_size + 1 # V step = 4 @@ -1045,7 +1045,7 @@ def test_beam_search_decode(): batch = 2 # B beam = model.n_beams # S model.decoder.reverse = True - length = model.max_length + 1 # L + length = model.max_peptide_len + 1 # L vocab = model.decoder.vocab_size + 1 # V step = 4 From a3548d00124c1242350a62fdbcb2f719484254fe Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Tue, 3 Sep 2024 13:37:46 -0700 Subject: [PATCH 38/84] added nonspecific digestion --- casanovo/config.py | 1 + casanovo/config.yaml | 4 +- casanovo/data/db_utils.py | 67 +++++++----- tests/unit_tests/test_unit.py | 185 ++++++++++++++++++++++++++++++++-- 4 files changed, 225 insertions(+), 32 deletions(-) diff --git a/casanovo/config.py b/casanovo/config.py index 8577d087..dc2a3d2c 100644 --- a/casanovo/config.py +++ b/casanovo/config.py @@ -18,6 +18,7 @@ _config_deprecated = dict( every_n_train_steps="val_check_interval", max_iters="cosine_schedule_period_iters", + max_length="max_peptide_len", ) diff --git a/casanovo/config.yaml b/casanovo/config.yaml index e8732b20..df6fa8bb 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -49,7 +49,9 @@ devices: # Can also take a regex expression to specify custom digestion rules. enzyme: "trypsin" # Digestion type for candidate peptide generation. -# full: standard digestion. semi: Include products of semi-specific cleavage. +# full: standard digestion. +# semi: Include products of semi-specific cleavage. +# non-specific: Include products of non-specific cleavage. digestion: "full" # Number of allowed missed cleavages when digesting protein. missed_cleavages: 0 diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index 26f7152c..f9c669ed 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -172,7 +172,7 @@ def _digest_fasta( See pyteomics.parser.expasy_rules for valid enzymes. Can also be a regex pattern. digestion : str - The type of digestion to perform. Either 'full' or 'partial'. + The type of digestion to perform. Either 'full', 'partial' or 'non-specific'. missed_cleavages : int The number of missed cleavages to allow. max_mods : int @@ -196,7 +196,7 @@ def _digest_fasta( raise FileNotFoundError(f"File {fasta_filename} does not exist.") peptide_list = [] - if digestion not in ["full", "partial"]: + if digestion not in ["full", "partial", "non-specific"]: logger.error("Digestion type %s not recognized.", digestion) raise ValueError(f"Digestion type {digestion} not recognized.") if enzyme not in parser.expasy_rules: @@ -204,28 +204,49 @@ def _digest_fasta( "Enzyme %s not recognized. 
Interpreting as cleavage rule.", enzyme, ) - semi = digestion == "partial" valid_aa = set(list(self.residues.keys()) + ["C"]) - for header, seq in fasta.read(fasta_filename): - pep_set = parser.cleave( - seq, - rule=enzyme, - missed_cleavages=missed_cleavages, - semi=semi, - ) - protein = header.split()[0] - for pep in pep_set: - if ( - len(pep) >= min_peptide_length - and len(pep) <= max_peptide_length - ): - if any(aa not in valid_aa for aa in pep): - logger.warn( - "Skipping peptide with unknown amino acids: %s", - pep, - ) - else: - peptide_list.append((pep, protein)) + if digestion == "non-specific": + for header, seq in fasta.read(fasta_filename): + pep_set = [] + # Generate all possible peptides + for i in range(len(seq)): + for j in range(i + 1, len(seq) + 1): + pep_set.append(seq[i:j]) + protein = header.split()[0] + for pep in pep_set: + if ( + len(pep) >= min_peptide_length + and len(pep) <= max_peptide_length + ): + if any(aa not in valid_aa for aa in pep): + logger.warn( + "Skipping peptide with unknown amino acids: %s", + pep, + ) + else: + peptide_list.append((pep, protein)) + else: + semi = digestion == "partial" + for header, seq in fasta.read(fasta_filename): + pep_set = parser.cleave( + seq, + rule=enzyme, + missed_cleavages=missed_cleavages, + semi=semi, + ) + protein = header.split()[0] + for pep in pep_set: + if ( + len(pep) >= min_peptide_length + and len(pep) <= max_peptide_length + ): + if any(aa not in valid_aa for aa in pep): + logger.warn( + "Skipping peptide with unknown amino acids: %s", + pep, + ) + else: + peptide_list.append((pep, protein)) # Generate modified peptides mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 63d492f8..594552af 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -327,12 +327,16 @@ def test_digest_fasta_mods(tiny_fasta_file, residues_dict): "+42.011EIVMTQSPPTLSLSPGER", "+43.006EIVMTQSPPTLSLSPGER", "-17.027MEAPAQLLFLLLLWLPDTTR", + "-17.027M+15.995EAPAQLLFLLLLWLPDTTR", # "MEAPAQLLFLLLLWLPDTTR", "MEAPAQ+0.984LLFLLLLWLPDTTR", "M+15.995EAPAQLLFLLLLWLPDTTR", "+43.006-17.027MEAPAQLLFLLLLWLPDTTR", + "+43.006-17.027M+15.995EAPAQLLFLLLLWLPDTTR", # "+42.011MEAPAQLLFLLLLWLPDTTR", "+43.006MEAPAQLLFLLLLWLPDTTR", + "+42.011M+15.995EAPAQLLFLLLLWLPDTTR", # + "+43.006M+15.995EAPAQLLFLLLLWLPDTTR", # "-17.027ASQSVSSSYLTWYQQKPGQAPR", "ASQSVSSSYLTWYQQKPGQAPR", "ASQ+0.984SVSSSYLTWYQQKPGQAPR", @@ -370,13 +374,6 @@ def test_digest_fasta_mods(tiny_fasta_file, residues_dict): residues=residues_dict, ) peptide_list = list(pdb.db_peptides["peptide"]) - peptide_list = [ - x - for x in peptide_list - if not re.search( - r"(\+42\.011|\+43\.006|\-17\.027|\+43\.006\-17\.027)+[A-Z]\+", x - ) - ] assert peptide_list == expected_1mod @@ -447,6 +444,136 @@ def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): # asp-n enzyme expected_aspn = ["DFAVYYC+57.021QQ", "DFTLTISSLQPE", "MEAPAQLLFLLLLWLP"] + expected_semispecific = [ + "FSGSGS", + "ATSIPA", + "ASQSVS", + "PGQAPR", + "TSIPAR", + "MEAPAQ", + "LLIYGA", + "YGASTR", + "LSPGER", + "LPDTTR", + "EIVMTQ", + "VTLSC+57.021R", + "QDYNLP", + ] + + expected_nonspecific = [ + "SGSGSG", + "GSGSGT", + "SGSGTD", + "FSGSGS", + "ATSIPA", + "GASTRA", + "LSLSPG", + "ASQSVS", + "GSGTDF", + "SLSPGE", + "QSVSSS", + "SQSVSS", + "KPGQAP", + "SPPTLS", + "ASTRAT", + "RFSGSG", + "IYGAST", + "APAQLL", + "PTLSLS", + "TLSLSP", + "TLTISS", + "STRATS", + "LIYGAS", + "ARFSGS", + "PGQAPR", + "SGTDFT", + "PPTLSL", 
+ "EAPAQL", + "QKPGQA", + "SVSSSY", + "TQSPPT", + "LTISSL", + "PARFSG", + "GQAPRL", + "QSPPTL", + "SPGERV", + "ISSLQP", + "RATSIP", + "TSIPAR", + "MEAPAQ", + "RASQSV", + "TISSLQ", + "TRATSI", + "LLIYGA", + "GTDFTL", + "YGASTR", + "VSSSYL", + "SSSYLT", + "LSPGER", + "PGERVT", + "MTQSPP", + "SSLQPE", + "VMTQSP", + "GERVTL", + "PEDFAV", + "IVMTQS", + "FTLTIS", + "APRLLI", + "QQKPGQ", + "SLQPED", + "PAQLLF", + "IPARFS", + "SIPARF", + "LSC+57.021RAS", + "TDFTLT", + "QAPRLL", + "LPDTTR", + "ERVTLS", + "AQLLFL", + "QPEDFA", + "TLSC+57.021RA", + "C+57.021RASQS", + "SC+57.021RASQ", + "DFTLTI", + "PDTTRE", + "TTREIV", + "EIVMTQ", + "YQQKPG", + "LFLLLL", + "LLFLLL", + "WLPDTT", + "DTTREI", + "RLLIYG", + "RVTLSC+57.021", + "VTLSC+57.021R", + "EDFAVY", + "LWLPDT", + "QLLFLL", + "LQPEDF", + "REIVMT", + "TREIVM", + "QDYNLP", + "LLLWLP", + "SSYLTW", + "LLWLPD", + "LLLLWL", + "PRLLIY", + "DFAVYY", + "QQDYNL", + "AVYYC+57.021Q", + "FLLLLW", + "FAVYYC+57.021", + "C+57.021QQDYN", + "SYLTWY", + "LTWYQQ", + "WYQQKP", + "TWYQQK", + "VYYC+57.021QQ", + "YLTWYQ", + "YC+57.021QQDY", + "YYC+57.021QQD", + ] + pdb = db_utils.ProteinDatabase( fasta_path=str(tiny_fasta_file), enzyme="arg-c", @@ -487,7 +614,7 @@ def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): peptide_list = list(pdb.db_peptides["peptide"]) assert peptide_list == expected_aspn - # Tesr regex rule instead of named enzyme + # Test regex rule instead of named enzyme pdb = db_utils.ProteinDatabase( fasta_path=str(tiny_fasta_file), enzyme="R", @@ -508,6 +635,48 @@ def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): peptide_list = list(pdb.db_peptides["peptide"]) assert peptide_list == expected_argc + # Test semispecific digest + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="partial", + missed_cleavages=0, + min_peptide_len=6, + max_peptide_len=6, + max_mods=0, + precursor_tolerance=10000, + isotope_error=[0, 0], + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" + ), + residues=residues_dict, + ) + peptide_list = list(pdb.db_peptides["peptide"]) + assert peptide_list == expected_semispecific + + # Test nonspecific digest + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="non-specific", + missed_cleavages=0, + min_peptide_len=6, + max_peptide_len=6, + max_mods=0, + precursor_tolerance=10000, + isotope_error=[0, 0], + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" + ), + residues=residues_dict, + ) + peptide_list = list(pdb.db_peptides["peptide"]) + assert peptide_list == expected_nonspecific + def test_get_candidates(tiny_fasta_file, residues_dict): # precursor_window is 10000 From e8d4682241b9b4d10384e9dfd92fd04258103e3e Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Fri, 13 Sep 2024 12:06:31 -0700 Subject: [PATCH 39/84] minor comments --- casanovo/data/db_utils.py | 35 +++++++++++++++++----------------- casanovo/denovo/dataloaders.py | 7 +------ casanovo/denovo/model.py | 6 ++---- 3 files changed, 20 insertions(+), 28 deletions(-) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index f9c669ed..19b312e2 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -127,12 +127,12 @@ def get_candidates( (self.db_peptides["calc_mass"] >= lower_bound) & (self.db_peptides["calc_mass"] <= 
upper_bound) ] - candidates.append(window[["peptide", "calc_mass", "protein"]]) + candidates.append(window[["peptide", "calc_mass"]]) candidates = pd.concat(candidates) candidates.drop_duplicates(inplace=True) candidates.sort_values(by=["calc_mass", "peptide"], inplace=True) - return candidates["peptide"], candidates["protein"] + return candidates["peptide"] def get_associated_protein(self, peptide: str) -> str: """ @@ -159,7 +159,7 @@ def _digest_fasta( max_mods: int, min_peptide_length: int, max_peptide_length: int, - ) -> pd.DataFrame: + ) -> Tuple[pd.DataFrame, dict]: """ Digests a FASTA file and returns the peptides, their masses, and associated protein. @@ -185,8 +185,8 @@ def _digest_fasta( Returns ------- pep_table : pd.DataFrame - A Pandas DataFrame with peptide, mass, - and protein columns. Sorted by neutral mass in ascending order. + A Pandas DataFrame with peptide and mass columns. + Sorted by neutral mass in ascending order. prot_map : dict A dictionary mapping peptides to associated proteins. """ @@ -207,17 +207,14 @@ def _digest_fasta( valid_aa = set(list(self.residues.keys()) + ["C"]) if digestion == "non-specific": for header, seq in fasta.read(fasta_filename): - pep_set = [] + protein = header.split()[0] # Generate all possible peptides for i in range(len(seq)): - for j in range(i + 1, len(seq) + 1): - pep_set.append(seq[i:j]) - protein = header.split()[0] - for pep in pep_set: - if ( - len(pep) >= min_peptide_length - and len(pep) <= max_peptide_length + for j in range( + i + min_peptide_length, + min(i + max_peptide_length + 1, len(seq) + 1), ): + pep = seq[i:j] if any(aa not in valid_aa for aa in pep): logger.warn( "Skipping peptide with unknown amino acids: %s", @@ -274,17 +271,19 @@ def _digest_fasta( isos, ) ] - # Create a DataFrame for easy sorting and filtering - pep_table = pd.DataFrame( - mod_peptide_list, columns=["peptide", "calc_mass", "protein"] - ) - pep_table.sort_values(by=["calc_mass", "peptide"], inplace=True) # Create a dictionary mapping for easy accession of associated proteins prot_map = defaultdict(list) for pep, _, prot in mod_peptide_list: prot_map[pep].append(prot) + # Create a DataFrame for easy sorting and filtering + pep_table = pd.DataFrame( + [(pep, mass) for pep, mass, _ in mod_peptide_list], + columns=["peptide", "calc_mass"], + ) + pep_table.sort_values(by=["calc_mass", "peptide"], inplace=True) + logger.info( "Digestion complete. %d peptides generated.", len(pep_table) ) diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 4793e2f3..2646329d 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -266,8 +266,6 @@ def prepare_psm_batch( The spectrum identifiers. all_peptides : List[str] The candidate peptides for each spectrum. - all_proteins : List[str] - The proteins associated with each candidate peptide. 
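Note on the candidate windowing in get_candidates above: the observed precursor m/z is converted to a neutral mass, each allowed isotope shift is applied, and database peptides whose calculated mass falls within the ppm tolerance are kept. A minimal standalone sketch of that arithmetic (the proton mass and isotope-spacing constants below are assumed values, not taken from this patch; the real conversion lives in the module's m/z-to-neutral-mass helper):

    PROTON = 1.00727646          # assumed constant
    ISOTOPE_SPACING = 1.003355   # assumed constant

    def neutral_mass(precursor_mz: float, charge: int) -> float:
        # Convert an observed precursor m/z back to a neutral peptide mass.
        return charge * (precursor_mz - PROTON)

    def mass_windows(precursor_mz, charge, tol_ppm, isotope_error):
        # Yield one (lower, upper) neutral-mass window per allowed isotope
        # error, mirroring the loop over isotope_error in get_candidates.
        for e in range(isotope_error[0], isotope_error[1] + 1):
            shifted = neutral_mass(precursor_mz, charge) - ISOTOPE_SPACING * e
            yield shifted * (1 - tol_ppm / 1e6), shifted * (1 + tol_ppm / 1e6)

For example, mass_windows(496.2, 2, 10, (0, 1)) yields two windows: one around the monoisotopic neutral mass and one shifted down by a single isotope spacing.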
""" spectra, precursor_mzs, precursor_charges, spectrum_ids = list(zip(*batch)) spectra = torch.nn.utils.rnn.pad_sequence(spectra, batch_first=True) @@ -283,9 +281,8 @@ def prepare_psm_batch( all_precursors = [] all_spectrum_ids = [] all_peptides = [] - all_proteins = [] for idx in range(len(batch)): - spec_peptides, spec_proteins = protein_database.get_candidates( + spec_peptides = protein_database.get_candidates( precursor_mzs[idx], precursor_charges[idx], ) @@ -298,7 +295,6 @@ def prepare_psm_batch( ) all_spectrum_ids.extend([spectrum_ids[idx]] * len(spec_peptides)) all_peptides.extend(spec_peptides) - all_proteins.extend(spec_proteins) except ValueError: logger.warning( "No candidates found for spectrum %s", spectrum_ids[idx] @@ -309,5 +305,4 @@ def prepare_psm_batch( torch.cat(all_precursors, dim=0), all_spectrum_ids, all_peptides, - all_proteins, ) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 6fe34bfa..ca5557fc 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1019,7 +1019,7 @@ def predict_step(self, batch, *args): ---------- batch : Tuple[torch.Tensor, torch.Tensor, np.array, List[str]] A batch of (i) MS/MS spectra, (ii) precursor information, (iii) - spectrum identifiers, (iv) candidate peptides, (v) associated protein. + spectrum identifiers, (iv) candidate peptides Returns ------- @@ -1049,7 +1049,6 @@ def predict_step(self, batch, *args): peptide_score, aa_scores, peptide, - protein, ) in zip( current_batch[1][:, 1].cpu().detach().numpy(), current_batch[1][:, 2].cpu().detach().numpy(), @@ -1057,7 +1056,6 @@ def predict_step(self, batch, *args): all_scores.cpu().detach().numpy(), per_aa_scores.cpu().detach().numpy(), current_batch[3], - current_batch[4], ): predictions.append( ( @@ -1067,7 +1065,7 @@ def predict_step(self, batch, *args): peptide, peptide_score, aa_scores, - protein, + self.protein_database.get_associated_protein(peptide), ) ) return predictions From 68b6926032814dcc4a6b650e1736c8ff92edf7cb Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Fri, 13 Sep 2024 13:41:39 -0700 Subject: [PATCH 40/84] full branch comments addressed --- casanovo/data/db_utils.py | 197 ++++++++++++++++++++-------------- tests/unit_tests/test_unit.py | 14 +-- 2 files changed, 123 insertions(+), 88 deletions(-) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index 19b312e2..34671eb1 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -6,7 +6,7 @@ import re import string from collections import defaultdict -from typing import List, Tuple +from typing import List, Tuple, Iterator import depthcharge.masses import pandas as pd @@ -70,22 +70,23 @@ def __init__( allowed_var_mods: str, residues: dict, ): - self.residues = residues self.fixed_mods, self.var_mods, self.swap_map = _construct_mods_dict( allowed_fixed_mods, allowed_var_mods ) + self.max_mods = max_mods self.swap_regex = re.compile( "(%s)" % "|".join(map(re.escape, self.swap_map.keys())) ) - self.db_peptides, self.prot_map = self._digest_fasta( + peptide_generator = _peptide_generator( fasta_path, enzyme, digestion, missed_cleavages, - max_mods, min_peptide_len, max_peptide_len, + set(list(residues.keys()) + ["C"]), ) + self.db_peptides, self.prot_map = self._digest_fasta(peptide_generator) self.precursor_tolerance = precursor_tolerance self.isotope_error = isotope_error @@ -152,35 +153,15 @@ def get_associated_protein(self, peptide: str) -> str: def _digest_fasta( self, - fasta_filename: str, - enzyme: str, - digestion: str, - missed_cleavages: 
int, - max_mods: int, - min_peptide_length: int, - max_peptide_length: int, + peptide_generator: Iterator[Tuple[str, str]], ) -> Tuple[pd.DataFrame, dict]: """ Digests a FASTA file and returns the peptides, their masses, and associated protein. Parameters ---------- - fasta_filename : str - Path to the FASTA file. - enzyme : str - The enzyme to use for digestion. - See pyteomics.parser.expasy_rules for valid enzymes. - Can also be a regex pattern. - digestion : str - The type of digestion to perform. Either 'full', 'partial' or 'non-specific'. - missed_cleavages : int - The number of missed cleavages to allow. - max_mods : int - The maximum number of modifications to allow per peptide. - min_peptide_length : int - The minimum length of peptides to consider. - max_peptide_length : int - The maximum length of peptides to consider. + peptide_generator : Iterator[Tuple[str, str]] + An iterator that yields peptides and associated proteins. Returns ------- @@ -190,60 +171,9 @@ def _digest_fasta( prot_map : dict A dictionary mapping peptides to associated proteins. """ - # Verify the existence of the file: - if not os.path.isfile(fasta_filename): - logger.error("File %s does not exist.", fasta_filename) - raise FileNotFoundError(f"File {fasta_filename} does not exist.") - peptide_list = [] - if digestion not in ["full", "partial", "non-specific"]: - logger.error("Digestion type %s not recognized.", digestion) - raise ValueError(f"Digestion type {digestion} not recognized.") - if enzyme not in parser.expasy_rules: - logger.info( - "Enzyme %s not recognized. Interpreting as cleavage rule.", - enzyme, - ) - valid_aa = set(list(self.residues.keys()) + ["C"]) - if digestion == "non-specific": - for header, seq in fasta.read(fasta_filename): - protein = header.split()[0] - # Generate all possible peptides - for i in range(len(seq)): - for j in range( - i + min_peptide_length, - min(i + max_peptide_length + 1, len(seq) + 1), - ): - pep = seq[i:j] - if any(aa not in valid_aa for aa in pep): - logger.warn( - "Skipping peptide with unknown amino acids: %s", - pep, - ) - else: - peptide_list.append((pep, protein)) - else: - semi = digestion == "partial" - for header, seq in fasta.read(fasta_filename): - pep_set = parser.cleave( - seq, - rule=enzyme, - missed_cleavages=missed_cleavages, - semi=semi, - ) - protein = header.split()[0] - for pep in pep_set: - if ( - len(pep) >= min_peptide_length - and len(pep) <= max_peptide_length - ): - if any(aa not in valid_aa for aa in pep): - logger.warn( - "Skipping peptide with unknown amino acids: %s", - pep, - ) - else: - peptide_list.append((pep, protein)) + for pep, prot in peptide_generator: + peptide_list.append((pep, prot)) # Generate modified peptides mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") @@ -253,7 +183,7 @@ def _digest_fasta( pep, variable_mods=self.var_mods, fixed_mods=self.fixed_mods, - max_mods=max_mods, + max_mods=self.max_mods, ), prot, ) @@ -290,6 +220,111 @@ def _digest_fasta( return pep_table, prot_map +def _peptide_generator( + fasta_filename: str, + enzyme: str, + digestion: str, + missed_cleavages: int, + min_peptide_length: int, + max_peptide_length: int, + valid_aa: set[str], +) -> Iterator[str]: + """ + Create a generator the yields peptides from a FASTA file + depending on the type of digestion specified. + + Parameters + ---------- + fasta_filename : str + Path to the FASTA file. + enzyme : str + The enzyme to use for digestion. + See pyteomics.parser.expasy_rules for valid enzymes. 
+ Can also be a regex pattern. + digestion : str + The type of digestion to perform. Either 'full', 'partial' or 'non-specific'. + missed_cleavages : int + The number of missed cleavages to allow. + min_peptide_length : int + The minimum length of peptides to consider. + max_peptide_length : int + The maximum length of peptides to consider. + valid_aa : set[str] + A set of valid amino acids. + + Yields + ------ + pep : str + A peptide sequence, unmodified. + protein : str + The associated protein. + """ + # Verify the existence of the file: + if not os.path.isfile(fasta_filename): + logger.error("File %s does not exist.", fasta_filename) + raise FileNotFoundError(f"File {fasta_filename} does not exist.") + if digestion not in ["full", "partial", "non-specific"]: + logger.error("Digestion type %s not recognized.", digestion) + raise ValueError(f"Digestion type {digestion} not recognized.") + if enzyme not in parser.expasy_rules: + logger.info( + "Enzyme %s not recognized. Interpreting as cleavage rule.", + enzyme, + ) + + # Verify the existence of the file: + if not os.path.isfile(fasta_filename): + logger.error("File %s does not exist.", fasta_filename) + raise FileNotFoundError(f"File {fasta_filename} does not exist.") + if digestion not in ["full", "partial", "non-specific"]: + logger.error("Digestion type %s not recognized.", digestion) + raise ValueError(f"Digestion type {digestion} not recognized.") + if enzyme not in parser.expasy_rules: + logger.info( + "Enzyme %s not recognized. Interpreting as cleavage rule.", + enzyme, + ) + if digestion == "non-specific": + for header, seq in fasta.read(fasta_filename): + protein = header.split()[0] + # Generate all possible peptides + for i in range(len(seq)): + for j in range( + i + min_peptide_length, + min(i + max_peptide_length + 1, len(seq) + 1), + ): + pep = seq[i:j] + if any(aa not in valid_aa for aa in pep): + logger.warn( + "Skipping peptide with unknown amino acids: %s", + pep, + ) + else: + yield pep, protein + else: + semi = digestion == "partial" + for header, seq in fasta.read(fasta_filename): + pep_set = parser.cleave( + seq, + rule=enzyme, + missed_cleavages=missed_cleavages, + semi=semi, + ) + protein = header.split()[0] + for pep in pep_set: + if ( + len(pep) >= min_peptide_length + and len(pep) <= max_peptide_length + ): + if any(aa not in valid_aa for aa in pep): + logger.warn( + "Skipping peptide with unknown amino acids: %s", + pep, + ) + else: + yield pep, protein + + @njit def _to_mz(precursor_mass: float, charge: int) -> float: """ diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 594552af..a0b0935d 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -705,7 +705,7 @@ def test_get_candidates(tiny_fasta_file, residues_dict): ), residues=residues_dict, ) - candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_smallwindow == list(candidates) pdb = db_utils.ProteinDatabase( @@ -725,7 +725,7 @@ def test_get_candidates(tiny_fasta_file, residues_dict): ), residues=residues_dict, ) - candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_midwindow == list(candidates) pdb = db_utils.ProteinDatabase( @@ -745,7 +745,7 @@ def test_get_candidates(tiny_fasta_file, residues_dict): ), residues=residues_dict, ) - candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) + candidates = 
pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_widewindow == list(candidates) @@ -814,7 +814,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file, residues_dict): residues=residues_dict, ) pdb.db_peptides = peptide_list - candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_isotope0 == list(candidates) pdb = db_utils.ProteinDatabase( @@ -835,7 +835,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file, residues_dict): residues=residues_dict, ) pdb.db_peptides = peptide_list - candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_isotope01 == list(candidates) pdb = db_utils.ProteinDatabase( @@ -856,7 +856,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file, residues_dict): residues=residues_dict, ) pdb.db_peptides = peptide_list - candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_isotope012 == list(candidates) pdb = db_utils.ProteinDatabase( @@ -877,7 +877,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file, residues_dict): residues=residues_dict, ) pdb.db_peptides = peptide_list - candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_isotope0123 == list(candidates) From e8c9c7d3aba05f7466ac3e94ecdf7e5a2156fd7f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 13 Sep 2024 21:21:25 +0000 Subject: [PATCH 41/84] Generate new screengrabs with rich-codex --- docs/images/evaluate-help.svg | 2 +- docs/images/help.svg | 169 +++++++++++++++++----------------- docs/images/sequence-help.svg | 2 +- docs/images/train-help.svg | 2 +- 4 files changed, 89 insertions(+), 86 deletions(-) diff --git a/docs/images/evaluate-help.svg b/docs/images/evaluate-help.svg index ec9c23a3..661f0efe 100644 --- a/docs/images/evaluate-help.svg +++ b/docs/images/evaluate-help.svg @@ -31,7 +31,7 @@ font-weight: bold; font-family: arial; } - + .terminal-1819499677-r1 { fill: #c5c8c6 } .terminal-1819499677-r2 { fill: #d0b344 } .terminal-1819499677-r3 { fill: #c5c8c6;font-weight: bold } diff --git a/docs/images/help.svg b/docs/images/help.svg index bf0fbef8..5418b95a 100644 --- a/docs/images/help.svg +++ b/docs/images/help.svg @@ -1,4 +1,4 @@ - + - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + - + - + - - $ casanovo --help - -Usage:casanovo [OPTIONSCOMMAND [ARGS]...                                     - - ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓  - ┃                                  Casanovo                                  ┃  - ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛  - Casanovo de novo sequences peptides from tandem mass spectra using a            - Transformer model. Casanovo currently supports mzML, mzXML, and MGF files for   - de novo sequencing and annotated MGF files, such as those from MassIVE-KB, for  - training new models.                                                            
- - Links:                                                                          - - • Documentation: https://casanovo.readthedocs.io                               - • Official code repository: https://github.com/Noble-Lab/casanovo              - - If you use Casanovo in your work, please cite:                                  - - • Yilmaz, M., Fondrie, W. E., Bittremieux, W., Oh, S. & Noble, W. S. De novo   -mass spectrometry peptide sequencing with a transformer model. Proceedings   -of the 39th International Conference on Machine Learning - ICML '22 (2022)   -doi:10.1101/2022.02.07.479481.                                               - -╭─ Options ────────────────────────────────────────────────────────────────────╮ ---help-h    Show this message and exit.                                     -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Commands ───────────────────────────────────────────────────────────────────╮ -configure Generate a Casanovo configuration file to customize.               -sequence  De novo sequence peptides from tandem mass spectra.                -train     Train a Casanovo model on your own data.                           -version   Get the Casanovo version information                               -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo --help + +Usage:casanovo [OPTIONSCOMMAND [ARGS]...                                     + + ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓  + ┃                                  Casanovo                                  ┃  + ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛  + Casanovo de novo sequences peptides from tandem mass spectra using a            + Transformer model. Casanovo currently supports mzML, mzXML, and MGF files for   + de novo sequencing and annotated MGF files, such as those from MassIVE-KB, for  + training new models.                                                            + + Links:                                                                          + + • Documentation: https://casanovo.readthedocs.io                               + • Official code repository: https://github.com/Noble-Lab/casanovo              + + If you use Casanovo in your work, please cite:                                  + + • Yilmaz, M., Fondrie, W. E., Bittremieux, W., Oh, S. & Noble, W. S. De novo   +mass spectrometry peptide sequencing with a transformer model. Proceedings   +of the 39th International Conference on Machine Learning - ICML '22 (2022)   +doi:10.1101/2022.02.07.479481.                                               + +╭─ Options ────────────────────────────────────────────────────────────────────╮ +--help-h    Show this message and exit.                                     +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Commands ───────────────────────────────────────────────────────────────────╮ +configure Generate a Casanovo configuration file to customize.               +db-search Perform a database search on MS/MS data using Casanovo-DB.         +sequence  De novo sequence peptides from tandem mass spectra.                +train     Train a Casanovo model on your own data.                           
+version   Get the Casanovo version information                               +╰──────────────────────────────────────────────────────────────────────────────╯ + diff --git a/docs/images/sequence-help.svg b/docs/images/sequence-help.svg index 3c3d5c54..70570e2a 100644 --- a/docs/images/sequence-help.svg +++ b/docs/images/sequence-help.svg @@ -32,7 +32,7 @@ font-family: arial; } -.terminal-3834786767-r1 { fill: #c5c8c6 } + .terminal-3834786767-r1 { fill: #c5c8c6 } .terminal-3834786767-r2 { fill: #d0b344 } .terminal-3834786767-r3 { fill: #c5c8c6;font-weight: bold } .terminal-3834786767-r4 { fill: #68a0b3;font-weight: bold } diff --git a/docs/images/train-help.svg b/docs/images/train-help.svg index 8875b1c4..e27717e1 100644 --- a/docs/images/train-help.svg +++ b/docs/images/train-help.svg @@ -32,7 +32,7 @@ font-family: arial; } -.terminal-956334679-r1 { fill: #c5c8c6 } + .terminal-956334679-r1 { fill: #c5c8c6 } .terminal-956334679-r2 { fill: #d0b344 } .terminal-956334679-r3 { fill: #c5c8c6;font-weight: bold } .terminal-956334679-r4 { fill: #68a0b3;font-weight: bold } From e474eeec619728df9b076b49b4360264c5d5ad6d Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Fri, 13 Sep 2024 16:58:37 -0700 Subject: [PATCH 42/84] updated and fixed failed tests --- casanovo/data/ms_io.py | 6 +++++- casanovo/denovo/model.py | 20 ++++++++++---------- tests/conftest.py | 10 ++++++++-- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py index 79143681..86f894ea 100644 --- a/casanovo/data/ms_io.py +++ b/casanovo/data/ms_io.py @@ -41,6 +41,9 @@ class PepSpecMatch: aa_scores : Iterable[float] A list of scores for individual amino acids in the peptide sequence, where len(aa_scores) == len(sequence) + protein : str + For DB-search mode, the protein from which the peptide + in the PSM was derived. Default value is "null". 
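For reference, a PSM carrying the new protein field can be constructed as follows (a usage sketch only; all values are invented, and the import assumes the dataclass still lives in casanovo.data.ms_io at this point in the series):

    from casanovo.data import ms_io

    psm = ms_io.PepSpecMatch(
        sequence="LESLIEK",
        spectrum_id=("sample.mgf", "scan=1"),   # hypothetical identifier
        peptide_score=0.87,
        charge=2,
        calc_mz=408.25,
        exp_mz=408.25,
        aa_scores=[0.9] * 7,
        protein="sp|P0XXXX|EXAMPLE",            # hypothetical accession
    )

Omitting protein keeps the default "null", which is what the mzTab writer now emits in the accession column for de novo results.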
""" sequence: str @@ -50,6 +53,7 @@ class PepSpecMatch: calc_mz: float exp_mz: float aa_scores: Iterable[float] + protein: str = "null" class MztabWriter: @@ -228,7 +232,7 @@ def save(self) -> None: "PSM", psm.sequence, # sequence i, # PSM_ID - "null" if len(psm) < 8 else psm[7], # accession + psm.protein, # accession "null", # unique "null", # database "null", # database_version diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 61064e09..245514fe 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1088,16 +1088,16 @@ def on_predict_batch_end( protein, ) in outputs: self.out_writer.psms.append( - ( - peptide, - tuple(spectrum_i), - peptide_score, - charge, - precursor_mz, - self.peptide_mass_calculator.mass(peptide, charge), - ",".join(list(map("{:.5f}".format, aa_scores))), - protein, - ), + ms_io.PepSpecMatch( + sequence=peptide, + spectrum_id=tuple(spectrum_i), + peptide_score=peptide_score, + charge=int(charge), + calc_mz=precursor_mz, + exp_mz=self.peptide_mass_calculator.mass(peptide, charge), + aa_scores=aa_scores, + protein=protein, + ) ) diff --git a/tests/conftest.py b/tests/conftest.py index 3c286f7c..2674c4ae 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -42,6 +42,7 @@ def mgf_medium(tmp_path): return _create_mgf(peptides, mgf_file, mod_aa_mass={"C": 160.030649}) +@pytest.fixture def mgf_small_unannotated(tmp_path): """An MGF file with 2 unannotated spectra.""" peptides = ["LESLIEK", "PEPTIDEK"] @@ -49,7 +50,9 @@ def mgf_small_unannotated(tmp_path): return _create_mgf(peptides, mgf_file, annotate=False) -def _create_mgf(peptides, mgf_file, random_state=42, mod_aa_mass=None, annotate=True): +def _create_mgf( + peptides, mgf_file, random_state=42, mod_aa_mass=None, annotate=True +): """ Create a fake MGF file from one or more peptides. 
@@ -73,7 +76,10 @@ def _create_mgf(peptides, mgf_file, random_state=42, mod_aa_mass=None, annotate= """ rng = np.random.default_rng(random_state) entries = [ - _create_mgf_entry(p, rng.choice([2, 3]), mod_aa_mass=mod_aa_mass, annotate=annotate) for p in peptides + _create_mgf_entry( + p, rng.choice([2, 3]), mod_aa_mass=mod_aa_mass, annotate=annotate + ) + for p in peptides ] with mgf_file.open("w+") as mgf_ref: mgf_ref.write("\n".join(entries)) From 4e696b45f92bb270155802b62fc1a3c09dfb7ee1 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Fri, 13 Sep 2024 17:58:18 -0700 Subject: [PATCH 43/84] add mztab validation to dbsearch test --- tests/test_integration.py | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/tests/test_integration.py b/tests/test_integration.py index b0034a12..3b7ae580 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -20,14 +20,14 @@ def test_db_search( CliRunner().invoke, casanovo.main, catch_exceptions=False ) - output_path = tmp_path / "db_search.mztab" + output_filename = tmp_path / "db_search.mztab" search_args = [ "db-search", "--config", tiny_config, "--output", - str(output_path), + str(output_filename), str(mgf_medium), str(tiny_fasta_file), ] @@ -35,10 +35,10 @@ def test_db_search( result = run(search_args) assert result.exit_code == 0 - assert output_path.exists() - assert output_path.is_file() + assert output_filename.exists() + assert output_filename.is_file() - mztab = pyteomics.mztab.MzTab(str(output_path)) + mztab = pyteomics.mztab.MzTab(str(output_filename)) psms = mztab.spectrum_match_table assert list(psms.sequence) == [ @@ -51,6 +51,30 @@ def test_db_search( "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", ] + # Validate mztab output + validate_args = [ + "java", + "-jar", + f"{TEST_DIR}/jmzTabValidator.jar", + "--check", + f"inFile={output_filename}", + ] + + validate_result = subprocess.run( + validate_args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + + assert validate_result.returncode == 0 + assert not any( + [ + line.startswith("[Error-") + for line in validate_result.stdout.splitlines() + ] + ) + def test_train_and_run( mgf_small, mzml_small, tiny_config, tmp_path, monkeypatch From 4655452d94260fa6bdbae4c92506db2a4fc92985 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Tue, 17 Sep 2024 09:10:35 -0700 Subject: [PATCH 44/84] lint fix --- casanovo/denovo/model_runner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 5ed8f0ec..4a122e9f 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -27,7 +27,6 @@ from ..denovo.model import Spec2Pep, DbSpec2Pep - logger = logging.getLogger("casanovo") From 5e1b9d7ee1f80bf92c9aa3680d4ff2b40d8d693a Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Tue, 17 Sep 2024 11:37:23 -0700 Subject: [PATCH 45/84] fix integration test --- casanovo/casanovo.py | 23 ++++++++++++++++++----- tests/test_integration.py | 10 ++++++---- tests/unit_tests/test_unit.py | 4 ++-- 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 4b3fdd3c..5550a6dd 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -219,17 +219,30 @@ def db_search( fasta_path: str, model: Optional[str], config: Optional[str], - output: Optional[str], + output_dir: Optional[str], + output_root: Optional[str], verbosity: str, + force_overwrite: bool, ) -> None: """Perform a database 
search on MS/MS data using Casanovo-DB. PEAK_PATH must be one or more mzML, mzXML, or MGF files. FASTA_PATH must be one FASTA file. """ - output = setup_logging(output, verbosity) - config, model = setup_model(model, config, output, False) - with ModelRunner(config, model) as runner: + output_path, output_root_name = _setup_output( + output_dir, output_root, force_overwrite, verbosity + ) + utils.check_dir_file_exists(output_path, f"{output_root}.mztab") + config, model = setup_model( + model, config, output_path, output_root_name, False + ) + with ModelRunner( + config, + model, + output_path, + output_root_name if output_root is not None else None, + False, + ) as runner: logger.info("Performing database search on:") for peak_file in peak_path: logger.info(" %s", peak_file) @@ -239,7 +252,7 @@ def db_search( runner.db_search( peak_path, fasta_path, - output, + str((output_path / output_root).with_suffix(".mztab")), ) logger.info("DONE!") diff --git a/tests/test_integration.py b/tests/test_integration.py index 59ea1e3a..eeeb498f 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -20,14 +20,17 @@ def test_db_search( CliRunner().invoke, casanovo.main, catch_exceptions=False ) - output_filename = tmp_path / "db_search.mztab" + output_rootname = "db" + output_filename = (tmp_path / output_rootname).with_suffix(".mztab") search_args = [ "db-search", "--config", tiny_config, - "--output", - str(output_filename), + "--output_dir", + str(tmp_path), + "--output_root", + output_rootname, str(mgf_medium), str(tiny_fasta_file), ] @@ -36,7 +39,6 @@ def test_db_search( assert result.exit_code == 0 assert output_filename.exists() - assert output_filename.is_file() mztab = pyteomics.mztab.MzTab(str(output_filename)) diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 3f0699ab..59e29b34 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -654,10 +654,10 @@ def test_digest_fasta_mods(tiny_fasta_file, residues_dict): "+43.006ASQSVSSSYLTWYQQKPGQAPR", "-17.027FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", - "FSGSGSGTDFTLTISSLQ+0.984PEDFAVYYC+57.021QQDYNLP", "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021Q+0.984QDYNLP", "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQ+0.984DYNLP", "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYN+0.984LP", + "FSGSGSGTDFTLTISSLQ+0.984PEDFAVYYC+57.021QQDYNLP", "+43.006-17.027FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", "+42.011FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", "+43.006FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", @@ -828,8 +828,8 @@ def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): "QQKPGQ", "SLQPED", "PAQLLF", - "IPARFS", "SIPARF", + "IPARFS", "LSC+57.021RAS", "TDFTLT", "QAPRLL", From 4d6b726dca7be84fcc74537a4dcf3229c93c6d8c Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Tue, 17 Sep 2024 11:42:17 -0700 Subject: [PATCH 46/84] fix unit tests --- tests/unit_tests/test_unit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 59e29b34..3f0699ab 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -654,10 +654,10 @@ def test_digest_fasta_mods(tiny_fasta_file, residues_dict): "+43.006ASQSVSSSYLTWYQQKPGQAPR", "-17.027FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + "FSGSGSGTDFTLTISSLQ+0.984PEDFAVYYC+57.021QQDYNLP", "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021Q+0.984QDYNLP", 
"FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQ+0.984DYNLP", "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYN+0.984LP", - "FSGSGSGTDFTLTISSLQ+0.984PEDFAVYYC+57.021QQDYNLP", "+43.006-17.027FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", "+42.011FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", "+43.006FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", @@ -828,8 +828,8 @@ def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): "QQKPGQ", "SLQPED", "PAQLLF", - "SIPARF", "IPARFS", + "SIPARF", "LSC+57.021RAS", "TDFTLT", "QAPRLL", From e7f0fdca13b10fd2401b34c21a574774cbbd7de4 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Fri, 20 Sep 2024 17:41:54 -0700 Subject: [PATCH 47/84] force fix test --- casanovo/data/db_utils.py | 4 +++- tests/unit_tests/test_unit.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index 34671eb1..c68d208c 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -212,7 +212,9 @@ def _digest_fasta( [(pep, mass) for pep, mass, _ in mod_peptide_list], columns=["peptide", "calc_mass"], ) - pep_table.sort_values(by=["calc_mass", "peptide"], inplace=True) + pep_table.sort_values( + by=["calc_mass", "peptide"], ascending=True, inplace=True + ) logger.info( "Digestion complete. %d peptides generated.", len(pep_table) diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 3f0699ab..e8562f49 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -680,7 +680,7 @@ def test_digest_fasta_mods(tiny_fasta_file, residues_dict): residues=residues_dict, ) peptide_list = list(pdb.db_peptides["peptide"]) - assert peptide_list == expected_1mod + assert set(peptide_list) == set(expected_1mod) def test_length_restrictions(tiny_fasta_file, residues_dict): @@ -981,7 +981,7 @@ def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): residues=residues_dict, ) peptide_list = list(pdb.db_peptides["peptide"]) - assert peptide_list == expected_nonspecific + assert set(peptide_list) == set(expected_nonspecific) def test_get_candidates(tiny_fasta_file, residues_dict): From 813fac0fab5bdad0a2f8b1187c1a9f61299407d5 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Fri, 20 Sep 2024 17:52:15 -0700 Subject: [PATCH 48/84] clean up test_digest_fasta_enzyme --- tests/unit_tests/test_unit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index e8562f49..8564ffcd 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -828,8 +828,8 @@ def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): "QQKPGQ", "SLQPED", "PAQLLF", - "IPARFS", "SIPARF", + "IPARFS", "LSC+57.021RAS", "TDFTLT", "QAPRLL", @@ -981,7 +981,7 @@ def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): residues=residues_dict, ) peptide_list = list(pdb.db_peptides["peptide"]) - assert set(peptide_list) == set(expected_nonspecific) + assert peptide_list == expected_nonspecific def test_get_candidates(tiny_fasta_file, residues_dict): From 310c3fda82d778a2f69077a7f8a1373ead3a1fd8 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Fri, 20 Sep 2024 18:01:50 -0700 Subject: [PATCH 49/84] adjust test_digest_fasta_mods --- tests/unit_tests/test_unit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 8564ffcd..59e29b34 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -654,10 +654,10 @@ def 
test_digest_fasta_mods(tiny_fasta_file, residues_dict): "+43.006ASQSVSSSYLTWYQQKPGQAPR", "-17.027FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", - "FSGSGSGTDFTLTISSLQ+0.984PEDFAVYYC+57.021QQDYNLP", "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021Q+0.984QDYNLP", "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQ+0.984DYNLP", "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYN+0.984LP", + "FSGSGSGTDFTLTISSLQ+0.984PEDFAVYYC+57.021QQDYNLP", "+43.006-17.027FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", "+42.011FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", "+43.006FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", @@ -680,7 +680,7 @@ def test_digest_fasta_mods(tiny_fasta_file, residues_dict): residues=residues_dict, ) peptide_list = list(pdb.db_peptides["peptide"]) - assert set(peptide_list) == set(expected_1mod) + assert peptide_list == expected_1mod def test_length_restrictions(tiny_fasta_file, residues_dict): From 775def7435580f1b3f5cc73e717d9706892388da Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Wed, 2 Oct 2024 08:43:00 -0700 Subject: [PATCH 50/84] allows top_match filtering for casanovo-db --- casanovo/config.yaml | 4 ++-- casanovo/data/psm.py | 3 +++ casanovo/denovo/model.py | 22 +++++++++++++++++----- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/casanovo/config.yaml b/casanovo/config.yaml index 98c8290f..6df7d094 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -21,6 +21,8 @@ min_peptide_len: 6 max_peptide_len: 100 # Number of spectra in one inference batch. predict_batch_size: 1024 +# Number of PSMs for each spectrum. +top_match: 1 ### @@ -29,8 +31,6 @@ predict_batch_size: 1024 # Number of beams used in beam search. n_beams: 1 -# Number of PSMs for each spectrum. -top_match: 1 # The hardware accelerator to use. Must be one of: # "cpu", "gpu", "tpu", "ipu", "hpu", "mps", or "auto". accelerator: "auto" diff --git a/casanovo/data/psm.py b/casanovo/data/psm.py index 0dc3c48b..3c33b4df 100644 --- a/casanovo/data/psm.py +++ b/casanovo/data/psm.py @@ -30,6 +30,8 @@ class PepSpecMatch: aa_scores : Iterable[float] A list of scores for individual amino acids in the peptide sequence, where len(aa_scores) == len(sequence) + protein : str + Protein associated with the peptide sequence (for db mode) """ sequence: str @@ -39,3 +41,4 @@ class PepSpecMatch: calc_mz: float exp_mz: float aa_scores: Iterable[float] + protein: str = None diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 0ac649ac..67d561bc 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1023,12 +1023,13 @@ def predict_step(self, batch, *args): Returns ------- - predictions: List[Tuple[int, int, float, str, np.ndarray, np.ndarray, str]] + predictions: List[Tuple[List[str], int, float, str, np.ndarray, np.ndarray, str]] Model predictions for the given batch of spectra containing spectrum ids, precursor charge and m/z, candidate peptide sequences, peptide - scores, amino acid-level scores, and associated proteins. + scores, amino acid-level scores, and associated proteins. Stored separately by + spectrum id. 
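The per-spectrum bookkeeping described here reduces to grouping PSMs by spectrum id and keeping the top_match highest-scoring ones. A minimal sketch of that selection, using the same tuple layout as the diff (peptide score at index 4):

    import collections

    def keep_top_matches(predictions, top_match=1):
        # predictions: (spectrum_id, charge, mz, peptide, score, aa_scores, protein)
        by_spectrum = collections.defaultdict(list)
        for pred in predictions:
            by_spectrum[str(pred[0])].append(pred)
        kept = []
        for preds in by_spectrum.values():
            kept.extend(sorted(preds, key=lambda p: p[4], reverse=True)[:top_match])
        return kept

This mirrors the store_dict / sorted(...)[: self.top_match] logic added below; top_match itself is the config option that this commit moves out of the de novo-only section of config.yaml.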
""" - predictions = [] + store_dict = collections.defaultdict(list) for start_idx in range(0, len(batch[0]), self.psm_batch_size): current_batch = [ b[start_idx : start_idx + self.psm_batch_size] for b in batch @@ -1057,7 +1058,7 @@ def predict_step(self, batch, *args): per_aa_scores.cpu().detach().numpy(), current_batch[3], ): - predictions.append( + store_dict[str(spectrum_i)].append( ( spectrum_i, precursor_charge, @@ -1068,11 +1069,22 @@ def predict_step(self, batch, *args): self.protein_database.get_associated_protein(peptide), ) ) + predictions = [] + for spectrum_i in store_dict: + predictions.extend( + sorted( + store_dict[str(spectrum_i)], + key=lambda x: x[4], + reverse=True, + )[: self.top_match] + ) return predictions def on_predict_batch_end( self, - outputs: List[Tuple[np.ndarray, List[str], torch.Tensor]], + outputs: List[ + Tuple[List[str], int, float, str, np.ndarray, np.ndarray, str] + ], *args, ) -> None: """ From e35c60dcbb7068a1aa5d09f506aacd4ac31ac36c Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Wed, 2 Oct 2024 09:08:46 -0700 Subject: [PATCH 51/84] change default value for protein value in PepSpecMatch --- casanovo/data/psm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/casanovo/data/psm.py b/casanovo/data/psm.py index 3c33b4df..e4ef3af7 100644 --- a/casanovo/data/psm.py +++ b/casanovo/data/psm.py @@ -41,4 +41,4 @@ class PepSpecMatch: calc_mz: float exp_mz: float aa_scores: Iterable[float] - protein: str = None + protein: str = "null" From 79cba590948544e2fdb89a7fb8e00207fa4ed93f Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Wed, 2 Oct 2024 10:30:26 -0700 Subject: [PATCH 52/84] reverse issues with decoder --- casanovo/denovo/model.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 67d561bc..8850c3d7 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1148,7 +1148,7 @@ def _calc_match_score( (for an entire batch) """ # Remove trailing tokens from predictions based on decoder reversal - if decoder_reverse: + if not decoder_reverse: batch_all_aa_scores = batch_all_aa_scores[:, 1:] else: batch_all_aa_scores = batch_all_aa_scores[:, :-1] @@ -1163,6 +1163,8 @@ def _calc_match_score( per_aa_scores = batch_all_aa_scores[rows, cols, truth_aa_indices] + logging.debug("$$$$$$$$$$$$$||%s||$$$$$$$$$$$$$$", per_aa_scores) + per_aa_scores[per_aa_scores == 0] += 1e-10 score_mask = truth_aa_indices != 0 per_aa_scores[~score_mask] = 0 From c9eb8b70e4724ff3f4639315129a72ef4adea4bd Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Wed, 2 Oct 2024 10:50:40 -0700 Subject: [PATCH 53/84] update test and remove logging statement --- casanovo/denovo/model.py | 2 -- tests/unit_tests/test_unit.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 8850c3d7..23b777cc 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1163,8 +1163,6 @@ def _calc_match_score( per_aa_scores = batch_all_aa_scores[rows, cols, truth_aa_indices] - logging.debug("$$$$$$$$$$$$$||%s||$$$$$$$$$$$$$$", per_aa_scores) - per_aa_scores[per_aa_scores == 0] += 1e-10 score_mask = truth_aa_indices != 0 per_aa_scores[~score_mask] = 0 diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index e5ac2253..5eee0e4e 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -510,7 +510,7 @@ def test_calc_match_score(): ) all_scores, masked_per_aa_scores = 
_calc_match_score( - batch_all_aa_scores, truth_aa_indices + batch_all_aa_scores, truth_aa_indices, True ) assert all_scores.numpy()[0] == 0 From 68e67e833607aeb8e5856181f1e74e357e993235 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Sun, 3 Nov 2024 06:08:00 -0800 Subject: [PATCH 54/84] db_utils fixes --- casanovo/casanovo.py | 5 +- casanovo/config.yaml | 20 +++--- casanovo/data/datasets.py | 2 +- casanovo/data/db_utils.py | 148 ++++++++++++++++++-------------------- 4 files changed, 85 insertions(+), 90 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 4feff0cb..5547a807 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -236,6 +236,7 @@ def db_search( config, model = setup_model( model, config, output_path, output_root_name, False ) + start_time = time.time() with ModelRunner( config, model, @@ -246,6 +247,7 @@ def db_search( logger.info("Performing database search on:") for peak_file in peak_path: logger.info(" %s", peak_file) + logger.info("Using the following FASTA file:") logger.info(" %s", fasta_path) @@ -254,8 +256,7 @@ def db_search( fasta_path, str((output_path / output_root).with_suffix(".mztab")), ) - - logger.info("DONE!") + utils.log_run_report(start_time=start_time, end_time=time.time()) @main.command(cls=_SharedParams) diff --git a/casanovo/config.yaml b/casanovo/config.yaml index 6df7d094..014f02ee 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -23,6 +23,13 @@ max_peptide_len: 100 predict_batch_size: 1024 # Number of PSMs for each spectrum. top_match: 1 +# The hardware accelerator to use. Must be one of: +# "cpu", "gpu", "tpu", "ipu", "hpu", "mps", or "auto". +accelerator: "auto" +# The devices to use. Can be set to a positive number int, or the value -1 to +# indicate all available devices should be used. If left empty, the appropriate +# number will be automatically selected for based on the chosen accelerator. +devices: ### @@ -31,13 +38,6 @@ top_match: 1 # Number of beams used in beam search. n_beams: 1 -# The hardware accelerator to use. Must be one of: -# "cpu", "gpu", "tpu", "ipu", "hpu", "mps", or "auto". -accelerator: "auto" -# The devices to use. Can be set to a positive number int, or the value -1 to -# indicate all available devices should be used. If left empty, the appropriate -# number will be automatically selected for based on the chosen accelerator. -devices: ### @@ -46,7 +46,7 @@ devices: # Enzyme for in silico digestion, used to generate candidate peptides. # See pyteomics.parser.expasy_rules for valid enzymes. -# Can also take a regex expression to specify custom digestion rules. +# Can also take a regex to specify custom digestion rules. enzyme: "trypsin" # Digestion type for candidate peptide generation. # full: standard digestion. @@ -60,9 +60,9 @@ missed_cleavages: 0 max_mods: 1 # Select which modifications from the vocabulary can be used in candidate creation. # Format: Comma-separated list of "aa:mod_residue", -# where aa is a standard amino acid or "nterm" for an N-terminal mod +# where aa is a standard amino acid (or "nterm" for an N-terminal mod) # and mod_residue is a key from the "residues" dictionary. 
-# Example: "M:M+15.995,X:+43.006-17.027" +# Example: "M:M+15.995,nterm:+43.006" allowed_fixed_mods: "C:C+57.021" allowed_var_mods: "M:M+15.995,N:N+0.984,Q:Q+0.984,nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" diff --git a/casanovo/data/datasets.py b/casanovo/data/datasets.py index 33d84e49..3917a2c8 100644 --- a/casanovo/data/datasets.py +++ b/casanovo/data/datasets.py @@ -1,6 +1,6 @@ """A PyTorch Dataset class for annotated spectra.""" -from typing import List, Optional, Tuple +from typing import Optional, Tuple import depthcharge import numpy as np diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index c68d208c..8d141117 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -24,7 +24,8 @@ class ProteinDatabase: """ - Store digested .fasta data and return candidate peptides for a given precursor mass. + Store digested .fasta data and return candidate peptides + for a given precursor mass. Parameters ---------- @@ -34,7 +35,8 @@ class ProteinDatabase: The enzyme to use for digestion. See pyteomics.parser.expasy_rules for valid enzymes. digestion : str - The type of digestion to perform. Either 'full' or 'partial'. + The type of digestion to perform. + Either 'full', 'partial' or 'non-specific'. missed_cleavages : int The number of missed cleavages to allow. min_peptide_len : int @@ -46,12 +48,13 @@ class ProteinDatabase: precursor_tolerance : float The precursor mass tolerance in ppm. isotope_error : Tuple[int, int] - Isotope range [min, max] to consider when comparing predicted and observed precursor m/z's. + Isotope range [min, max] to consider when comparing predicted + and observed precursor m/z's. allowed_fixed_mods : str A comma separated string of fixed modifications to consider. allowed_var_mods : str A comma separated string of variable modifications to consider. - residues : dict + residues : dict[str, float] A dictionary of amino acid masses. """ @@ -68,7 +71,7 @@ def __init__( isotope_error: Tuple[int, int], allowed_fixed_mods: str, allowed_var_mods: str, - residues: dict, + residues: dict[str, float], ): self.fixed_mods, self.var_mods, self.swap_map = _construct_mods_dict( allowed_fixed_mods, allowed_var_mods @@ -84,7 +87,7 @@ def __init__( missed_cleavages, min_peptide_len, max_peptide_len, - set(list(residues.keys()) + ["C"]), + set([aa[0] for aa in residues.keys() if aa[0].isalpha()]), ) self.db_peptides, self.prot_map = self._digest_fasta(peptide_generator) self.precursor_tolerance = precursor_tolerance @@ -94,9 +97,10 @@ def get_candidates( self, precursor_mz: float, charge: int, - ) -> List[Tuple[str, str]]: + ) -> pd.Series: """ - Returns a list of candidate peptides that fall within the specified mass range. + Returns a list of candidate peptides that fall within the + specified mass range. Parameters ---------- @@ -115,7 +119,7 @@ def get_candidates( for e in range(self.isotope_error[0], self.isotope_error[1] + 1): iso_shift = ISOTOPE_SPACING * e shift_raw_mass = float( - _to_raw_mass(precursor_mz, charge) - iso_shift + _to_neutral_mass(precursor_mz, charge) - iso_shift ) upper_bound = shift_raw_mass * ( 1 + (self.precursor_tolerance / 1e6) @@ -154,9 +158,10 @@ def get_associated_protein(self, peptide: str) -> str: def _digest_fasta( self, peptide_generator: Iterator[Tuple[str, str]], - ) -> Tuple[pd.DataFrame, dict]: + ) -> Tuple[pd.DataFrame, dict[str, str]]: """ - Digests a FASTA file and returns the peptides, their masses, and associated protein. 
+ Digests a FASTA file and returns the peptides, their masses, + and associated protein. Parameters ---------- @@ -168,13 +173,9 @@ def _digest_fasta( pep_table : pd.DataFrame A Pandas DataFrame with peptide and mass columns. Sorted by neutral mass in ascending order. - prot_map : dict + prot_map : dict[str, str] A dictionary mapping peptides to associated proteins. """ - peptide_list = [] - for pep, prot in peptide_generator: - peptide_list.append((pep, prot)) - # Generate modified peptides mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") peptide_isoforms = [ @@ -187,7 +188,7 @@ def _digest_fasta( ), prot, ) - for pep, prot in peptide_list + for pep, prot in peptide_generator ] mod_peptide_list = [ (mod_pep, mass_calculator.mass(mod_pep), prot) @@ -203,9 +204,9 @@ def _digest_fasta( ] # Create a dictionary mapping for easy accession of associated proteins - prot_map = defaultdict(list) + prot_map = defaultdict(set) for pep, _, prot in mod_peptide_list: - prot_map[pep].append(prot) + prot_map[pep].add(prot) # Create a DataFrame for easy sorting and filtering pep_table = pd.DataFrame( @@ -227,8 +228,8 @@ def _peptide_generator( enzyme: str, digestion: str, missed_cleavages: int, - min_peptide_length: int, - max_peptide_length: int, + min_peptide_len: int, + max_peptide_len: int, valid_aa: set[str], ) -> Iterator[str]: """ @@ -242,14 +243,15 @@ def _peptide_generator( enzyme : str The enzyme to use for digestion. See pyteomics.parser.expasy_rules for valid enzymes. - Can also be a regex pattern. + Can also be a regex. digestion : str - The type of digestion to perform. Either 'full', 'partial' or 'non-specific'. + The type of digestion to perform. + Either 'full', 'partial' or 'non-specific'. missed_cleavages : int The number of missed cleavages to allow. - min_peptide_length : int + min_peptide_len : int The minimum length of peptides to consider. - max_peptide_length : int + max_peptide_len : int The maximum length of peptides to consider. valid_aa : set[str] A set of valid amino acids. @@ -261,19 +263,6 @@ def _peptide_generator( protein : str The associated protein. """ - # Verify the existence of the file: - if not os.path.isfile(fasta_filename): - logger.error("File %s does not exist.", fasta_filename) - raise FileNotFoundError(f"File {fasta_filename} does not exist.") - if digestion not in ["full", "partial", "non-specific"]: - logger.error("Digestion type %s not recognized.", digestion) - raise ValueError(f"Digestion type {digestion} not recognized.") - if enzyme not in parser.expasy_rules: - logger.info( - "Enzyme %s not recognized. 
Interpreting as cleavage rule.", - enzyme, - ) - # Verify the existence of the file: if not os.path.isfile(fasta_filename): logger.error("File %s does not exist.", fasta_filename) @@ -292,12 +281,12 @@ def _peptide_generator( # Generate all possible peptides for i in range(len(seq)): for j in range( - i + min_peptide_length, - min(i + max_peptide_length + 1, len(seq) + 1), + i + min_peptide_len, + min(i + max_peptide_len + 1, len(seq) + 1), ): pep = seq[i:j] if any(aa not in valid_aa for aa in pep): - logger.warn( + logger.warning( "Skipping peptide with unknown amino acids: %s", pep, ) @@ -314,12 +303,9 @@ def _peptide_generator( ) protein = header.split()[0] for pep in pep_set: - if ( - len(pep) >= min_peptide_length - and len(pep) <= max_peptide_length - ): + if len(pep) >= min_peptide_len and len(pep) <= max_peptide_len: if any(aa not in valid_aa for aa in pep): - logger.warn( + logger.warning( "Skipping peptide with unknown amino acids: %s", pep, ) @@ -348,7 +334,7 @@ def _to_mz(precursor_mass: float, charge: int) -> float: @njit -def _to_raw_mass(mz_mass: float, charge: int) -> float: +def _to_neutral_mass(mz_mass: float, charge: int) -> float: """ Convert precursor m/z value to neutral mass. @@ -367,23 +353,33 @@ def _to_raw_mass(mz_mass: float, charge: int) -> float: return charge * (mz_mass - PROTON) -def _convert_from_modx(seq: str, swap_map: dict, swap_regex: str) -> str: - """Converts peptide sequence from modX format to Casanovo-acceptable modifications. - - Args: - seq : str - Peptide in modX format - swap_map : dict - Dictionary that allows for swapping of modX to Casanovo-acceptable modifications. - swap_regex : str - Regular expression to match modX format. +def _convert_from_modx( + seq: str, swap_map: dict[str, str], swap_regex: str +) -> str: + """ + Converts peptide sequence from modX format to + Casanovo-acceptable modifications. + + Parameters: + ----------- + seq : str + Peptide in modX format + swap_map : dict[str, str] + Dictionary that allows for swapping of modX to Casanovo-acceptable modifications. + swap_regex : str + Regular expression to match modX format. + + Returns: + -------- + swap_regex : str + Peptide in Casanovo-acceptable modifications. """ return swap_regex.sub(lambda x: swap_map[x.group()], seq) def _construct_mods_dict( allowed_fixed_mods: str, allowed_var_mods: str -) -> Tuple[dict, dict, dict]: +) -> Tuple[dict[str, str], dict[str, str], dict[str, str]]: """ Constructs dictionaries of fixed and variable modifications. @@ -396,30 +392,28 @@ def _construct_mods_dict( Returns ------- - fixed_mods : dict + fixed_mods : dict[str, str] A dictionary of fixed modifications. - var_mods : dict + var_mods : dict[str, str] A dictionary of variable modifications. - swap_map : dict - A dictionary that allows for swapping of modX to Casanovo-acceptable modifications. + swap_map : dict[str, str] + A dictionary that allows for swapping of modX to + Casanovo-acceptable modifications. 
""" swap_map = {} fixed_mods = {} - for idx, mod in enumerate(allowed_fixed_mods.split(",")): - aa, mod_aa = mod.split(":") - mod_id = string.ascii_lowercase[idx] - fixed_mods[mod_id] = [aa] - swap_map[f"{mod_id}{aa}"] = f"{mod_aa}" - var_mods = {} - for idx, mod in enumerate(allowed_var_mods.split(",")): - aa, mod_aa = mod.split(":") - mod_id = string.ascii_lowercase[idx] - if aa == "nterm": - var_mods[f"{mod_id}-"] = True - swap_map[f"{mod_id}-"] = f"{mod_aa}" - else: - var_mods[mod_id] = [aa] - swap_map[f"{mod_id}{aa}"] = f"{mod_aa}" + for mod_map, allowed_mods in zip( + [fixed_mods, var_mods], [allowed_fixed_mods, allowed_var_mods] + ): + for idx, mod in enumerate(allowed_mods.split(",")): + aa, mod_aa = mod.split(":") + mod_id = string.ascii_lowercase[idx] + if aa == "nterm": + mod_map[f"{mod_id}-"] = True + swap_map[f"{mod_id}-"] = f"{mod_aa}" + else: + mod_map[mod_id] = [aa] + swap_map[f"{mod_id}{aa}"] = f"{mod_aa}" return fixed_mods, var_mods, swap_map From d01dd7fda222acff904575e7df528df05f11abbd Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Sun, 3 Nov 2024 11:39:35 -0800 Subject: [PATCH 55/84] updates to dataloaders, model_runner, and model.py --- casanovo/denovo/dataloaders.py | 12 ++-- casanovo/denovo/model.py | 108 +++++++++++++------------------- casanovo/denovo/model_runner.py | 51 ++++++--------- tests/conftest.py | 102 ++---------------------------- 4 files changed, 72 insertions(+), 201 deletions(-) diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 2646329d..4eb4d2e2 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -137,6 +137,7 @@ def _make_loader( dataset: torch.utils.data.Dataset, batch_size: int, shuffle: bool = False, + collate_fn: Optional[callable] = None, ) -> torch.utils.data.DataLoader: """ Create a PyTorch DataLoader. @@ -149,6 +150,8 @@ def _make_loader( The batch size to use. shuffle : bool Option to shuffle the batches. + collate_fn : Optional[callable] + A function to collate the data into a batch. Returns ------- @@ -158,7 +161,7 @@ def _make_loader( return torch.utils.data.DataLoader( dataset, batch_size=batch_size, - collate_fn=prepare_batch, + collate_fn=prepare_batch if collate_fn is None else collate_fn, pin_memory=True, num_workers=self.n_workers, shuffle=shuffle, @@ -184,15 +187,12 @@ def predict_dataloader(self) -> torch.utils.data.DataLoader: def db_dataloader(self) -> torch.utils.data.DataLoader: """Get a special dataloader for DB search""" - return torch.utils.data.DataLoader( + return self._make_loader( self.test_dataset, - batch_size=self.eval_batch_size, + self.eval_batch_size, collate_fn=functools.partial( prepare_psm_batch, protein_database=self.protein_database ), - pin_memory=True, - num_workers=self.n_workers, - shuffle=False, ) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 40328701..40d0cc3d 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -994,17 +994,12 @@ class DbSpec2Pep(Spec2Pep): Subclass of Spec2Pep for the use of Casanovo as an MS/MS database search score function. - Uses teacher forcing to 'query' Casanovo for its score for each AA - within a candidate peptide, and takes the geometric average of these scores - and reports this as the score for the spectrum-peptide pair. Note that the - geometric mean of the AA scores is actually calculated by a - summation and average of the log of the scores, to preserve numerical - stability. This does not affect PSM ranking. 
+ Uses teacher forcing to 'query' Casanovo to score a peptide-spectrum + pair. Higher scores indicate a better match between the peptide and + spectrum. The amino acid-level scores are also returned. Also note that although teacher-forcing is used within this method, there is *no training* involved. This is a prediction-only method. - - Output is provided in .mztab format. """ def __init__(self, *args, **kwargs): @@ -1034,17 +1029,15 @@ def predict_step(self, batch, *args): current_batch = [ b[start_idx : start_idx + self.psm_batch_size] for b in batch ] - pred, truth = self.decoder( - current_batch[3], - current_batch[1], - *self.encoder(current_batch[0]), + pred, truth = self._forward_step( + current_batch[0], current_batch[1], current_batch[3] ) pred = self.softmax(pred) - all_scores, per_aa_scores = _calc_match_score( + all_peptide_scores, all_aa_scores = _calc_match_score( pred, truth, self.decoder.reverse ) for ( - precursor_charge, + charge, precursor_mz, spectrum_i, peptide_score, @@ -1054,27 +1047,32 @@ def predict_step(self, batch, *args): current_batch[1][:, 1].cpu().detach().numpy(), current_batch[1][:, 2].cpu().detach().numpy(), current_batch[2], - all_scores.cpu().detach().numpy(), - per_aa_scores.cpu().detach().numpy(), + all_peptide_scores, + all_aa_scores, current_batch[3], ): - store_dict[str(spectrum_i)].append( - ( - spectrum_i, - precursor_charge, - precursor_mz, - peptide, - peptide_score, - aa_scores, - self.protein_database.get_associated_protein(peptide), + store_dict[spectrum_i].append( + ms_io.PepSpecMatch( + sequence=peptide, + spectrum_id=tuple(spectrum_i), + peptide_score=peptide_score, + charge=int(charge), + calc_mz=precursor_mz, + exp_mz=self.peptide_mass_calculator.mass( + peptide, charge + ), + aa_scores=aa_scores, + protein=self.protein_database.get_associated_protein( + peptide + ), ) ) predictions = [] for spectrum_i in store_dict: predictions.extend( sorted( - store_dict[str(spectrum_i)], - key=lambda x: x[4], + store_dict[spectrum_i], + key=lambda x: x.peptide_score, reverse=True, )[: self.top_match] ) @@ -1090,27 +1088,7 @@ def on_predict_batch_end( """ Write the database search results to the output file. """ - for ( - spectrum_i, - charge, - precursor_mz, - peptide, - peptide_score, - aa_scores, - protein, - ) in outputs: - self.out_writer.psms.append( - ms_io.PepSpecMatch( - sequence=peptide, - spectrum_id=tuple(spectrum_i), - peptide_score=peptide_score, - charge=int(charge), - calc_mz=precursor_mz, - exp_mz=self.peptide_mass_calculator.mass(peptide, charge), - aa_scores=aa_scores, - protein=protein, - ) - ) + self.out_writer.psms.extend(outputs) def _calc_match_score( @@ -1124,8 +1102,7 @@ def _calc_match_score( Take in teacher-forced scoring of amino acids of the peptides (in a batch) and use the truth labels to calculate a score between the input spectra and - associated peptide. The score is the geometric - mean of the AA probabilities + associated peptide. Parameters ---------- @@ -1134,18 +1111,19 @@ def _calc_match_score( the vocabulary for every prediction made to generate the associated peptide (for an entire batch) truth_aa_indices : torch.Tensor - Indicies of the score for each actual amino acid + Indices of the score for each actual amino acid in the peptide (for an entire batch) decoder_reverse : bool Whether the decoder is reversed. 
Returns ------- - (all_scores, per_aa_scores) : Tuple[torch.Tensor, torch.Tensor] + all_peptide_scores: List[float] The score between the input spectra and associated peptide - (for an entire batch) - a list of lists of per amino acid scores - (for an entire batch) + for each PSM in the batch. + all_aa_scores : List[List[float]] + A list of lists of per amino acid scores + for each PSM in the batch. """ # Remove trailing tokens from predictions based on decoder reversal if not decoder_reverse: @@ -1162,19 +1140,19 @@ def _calc_match_score( cols = torch.arange(0, batch_all_aa_scores.shape[1]).expand_as(rows) per_aa_scores = batch_all_aa_scores[rows, cols, truth_aa_indices] - + per_aa_scores = per_aa_scores.cpu().detach().numpy() per_aa_scores[per_aa_scores == 0] += 1e-10 score_mask = truth_aa_indices != 0 per_aa_scores[~score_mask] = 0 - log_per_aa_scores = torch.log(per_aa_scores) - all_scores = torch.where( - log_per_aa_scores == float("-inf"), - torch.tensor(0.0), - log_per_aa_scores, - ).sum(dim=1) / score_mask.sum( - dim=1 - ) # Calculates geometric score - return all_scores, per_aa_scores + all_peptide_scores = [] + all_aa_scores = [] + for psm_score in per_aa_scores: + psm_score = np.trim_zeros(psm_score) + aa_scores, peptide_score = _aa_pep_score(psm_score, True) + all_peptide_scores.append(peptide_score) + all_aa_scores.append(aa_scores) + + return all_peptide_scores, all_aa_scores class CosineWarmupScheduler(torch.optim.lr_scheduler._LRScheduler): diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index b1b1046f..b097f6d5 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -127,24 +127,24 @@ def db_search( self, peak_path: Iterable[str], fasta_path: str, - output: str, + results_path: str, ) -> None: """Perform database search with Casanovo. Parameters ---------- peak_path : Iterable[str] - The paths to the .mgf data files for database search. + The path with the MS data files for database search. fasta_path : str - The path to the FASTA file for database search. - output : str - Where should the output be saved? + The path with the FASTA file for database search. + results_path : str + Sequencing results file path Returns ------- self """ - self.writer = ms_io.MztabWriter(Path(output).with_suffix(".mztab")) + self.writer = ms_io.MztabWriter(results_path) self.writer.set_metadata( self.config, model=str(self.model_filename), @@ -266,7 +266,7 @@ def predict( Parameters ---------- - peak_path : iterable of str + peak_path : Iterable[str] The path with the MS data files for predicting peptide sequences. results_path : str Sequencing results file path @@ -431,12 +431,12 @@ def initialize_model( ) if self.model_filename is None: - # Train a model from scratch if no model file is provided. if db_search: logger.error("DB search mode requires a model file") raise ValueError( "A model file must be provided for DB search mode" ) + # Train a model from scratch if no model file is provided. if train: self.model = Spec2Pep(**model_params) return @@ -456,19 +456,13 @@ def initialize_model( # First try loading model details from the weights file, otherwise use # the provided configuration. device = torch.empty(1).device # Use the default device. 
+ Model = DbSpec2Pep if db_search else Spec2Pep try: - if db_search: - self.model = DbSpec2Pep.load_from_checkpoint( - self.model_filename, - map_location=device, - **loaded_model_params, - ) - else: - self.model = Spec2Pep.load_from_checkpoint( - self.model_filename, - map_location=device, - **loaded_model_params, - ) + self.model = Model.load_from_checkpoint( + self.model_filename, + map_location=device, + **loaded_model_params, + ) architecture_params = set(model_params.keys()) - set( loaded_model_params.keys() @@ -484,18 +478,11 @@ def initialize_model( except RuntimeError: # This only doesn't work if the weights are from an older version try: - if db_search: - self.model = DbSpec2Pep.load_from_checkpoint( - self.model_filename, - map_location=device, - **model_params, - ) - else: - self.model = Spec2Pep.load_from_checkpoint( - self.model_filename, - map_location=device, - **model_params, - ) + self.model = Model.load_from_checkpoint( + self.model_filename, + map_location=device, + **model_params, + ) except RuntimeError: raise RuntimeError( "Weights file incompatible with the current version of " diff --git a/tests/conftest.py b/tests/conftest.py index 94b2d744..1729dcb3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,6 @@ """Fixtures used for testing.""" +import depthcharge import numpy as np import pandas as pd import psims @@ -108,9 +109,9 @@ def _create_mgf_entry(peptide, charge=2, mod_aa_mass=None, annotate=True): The PSM entry in an MGF file format. """ if mod_aa_mass is None: - precursor_mz = calculate_mass(peptide, charge=int(charge)) + precursor_mz = fast_mass(peptide, charge=int(charge)) else: - aa_mass = std_aa_mass + aa_mass = std_aa_mass.copy() aa_mass.update(mod_aa_mass) precursor_mz = fast_mass(peptide, charge=int(charge), aa_mass=aa_mass) mzs, intensities = _peptide_to_peaks(peptide, charge) @@ -332,99 +333,4 @@ def tiny_config(tmp_path): @pytest.fixture def residues_dict(): - return { - "G": 57.021464, - "A": 71.037114, - "S": 87.032028, - "P": 97.052764, - "V": 99.068414, - "T": 101.047670, - "C+57.021": 160.030649, - "L": 113.084064, - "I": 113.084064, - "N": 114.042927, - "D": 115.026943, - "Q": 128.058578, - "K": 128.094963, - "E": 129.042593, - "M": 131.040485, - "H": 137.058912, - "F": 147.068414, - "R": 156.101111, - "Y": 163.063329, - "W": 186.079313, - "M+15.995": 147.035400, - "N+0.984": 115.026943, - "Q+0.984": 129.042594, - "+42.011": 42.010565, - "+43.006": 43.005814, - "-17.027": -17.026549, - "+43.006-17.027": 25.980265, - } - - -@pytest.fixture -def tide_dir_small(tmp_path): - """A directory with a very small TIDE search result.""" - tide_dir = tmp_path / "tide_results" - tide_dir.mkdir() - - # Key is the scan number - built_dict = { - 0: { - "targets": ["LESLIEK", "PEPTIDEK"], - "decoys": ["KEILSEL", "KEDITEPP"], - }, - 1: { - "targets": ["LESLIEK", "PEPTIDEK"], - "decoys": ["KEILSEL", "KEDITEPP"], - }, - 2: { - "targets": [ - "L[42.011]EM[15.9]SLIM[15.995]EK", - "P[43.01]EN[0.99]PTIQ[0.984]DEK", - ], - "decoys": [ - "K[-17.03]M[15.995]EILSEL", - "K[25.1]EDITEPP", - "KEDIQ[0.984]TEPPQ[0.984]", - ], - }, - } - - _create_tide_results_target(tide_dir, built_dict) - _create_tide_results_decoy(tide_dir, built_dict) - - return tide_dir - - -def _create_tide_results_target(tide_dir, built_dict): - """Create a fake TIDE search result file (target).""" - out_file = tide_dir / "tide-search.target.txt" - df = pd.DataFrame(columns=["scan", "sequence", "target/decoy"]) - for scan, peptides in built_dict.items(): - entry = pd.DataFrame.from_dict( 
- { - "scan": [scan] * len(peptides["targets"]), - "sequence": peptides["targets"], - "target/decoy": ["target"] * len(peptides["targets"]), - } - ) - df = pd.concat([df, entry], ignore_index=True) - df.to_csv(out_file, sep="\t", index=True) - - -def _create_tide_results_decoy(tide_dir, built_dict): - """Create a fake TIDE search result file (decoy).""" - out_file = tide_dir / "tide-search.decoy.txt" - df = pd.DataFrame(columns=["scan", "sequence", "target/decoy"]) - for scan, peptides in built_dict.items(): - entry = pd.DataFrame.from_dict( - { - "scan": [scan] * len(peptides["decoys"]), - "sequence": peptides["decoys"], - "target/decoy": ["decoy"] * len(peptides["decoys"]), - } - ) - df = pd.concat([df, entry], ignore_index=True) - df.to_csv(out_file, sep="\t", index=True) + return depthcharge.masses.PeptideMass("massivekb").masses From d5819442011e29c4b24743cedb56a604e21a7afb Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Sun, 3 Nov 2024 12:26:18 -0800 Subject: [PATCH 56/84] near final changes for all but db_utils --- casanovo/data/db_utils.py | 20 ------------ tests/unit_tests/test_unit.py | 59 +++++++++++++++++++++++++++++------ 2 files changed, 49 insertions(+), 30 deletions(-) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index 8d141117..e704907c 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -313,26 +313,6 @@ def _peptide_generator( yield pep, protein -@njit -def _to_mz(precursor_mass: float, charge: int) -> float: - """ - Convert precursor neutral mass to m/z value. - - Parameters - ---------- - precursor_mass : float - The precursor neutral mass. - charge : int - The precursor charge. - - Returns - ------- - mz : float - The calculated precursor mass-to-charge ratio. - """ - return (precursor_mass + (charge * PROTON)) / charge - - @njit def _to_neutral_mass(mz_mass: float, charge: int) -> float: """ diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 482ef853..034f4874 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -453,6 +453,35 @@ def test_aa_pep_score(): assert peptide_score == pytest.approx(0.5) +def test_peptide_generator_errors(residues_dict, tiny_fasta_file): + with pytest.raises(FileNotFoundError) as e_info: + [ + (a, b) + for a, b in db_utils._peptide_generator( + "fail.fasta", "trypsin", "full", 0, 5, 10, residues_dict + ) + ] + with pytest.raises(ValueError) as e_info: + [ + (a, b) + for a, b in db_utils._peptide_generator( + tiny_fasta_file, "trypsin", "fail", 0, 5, 10, residues_dict + ) + ] + + +def test_to_neutral_mass(): + mz = 500 + charge = 2 + neutral_mass = db_utils._to_neutral_mass(mz, charge) + assert neutral_mass == 997.98544706646 + + mz = 500 + charge = 1 + neutral_mass = db_utils._to_neutral_mass(mz, charge) + assert neutral_mass == 498.99272353323 + + def test_calc_match_score(): """ Test the calculation of geometric scores using teacher-forced @@ -518,19 +547,29 @@ def test_calc_match_score(): batch_all_aa_scores, truth_aa_indices, True ) - assert all_scores.numpy()[0] == 0 - assert all_scores.numpy()[1] == 0 - assert all_scores.numpy()[2] == pytest.approx( - np.log(0.5 * 0.5 * 1 * 1) / 4 + assert all_scores[0] == np.exp(0) + assert all_scores[1] == np.exp(0) + assert all_scores[2] == pytest.approx( + np.exp(np.log(0.5 * 0.5 * 1 * 1) / 4) ) - assert all_scores.numpy()[3] == pytest.approx( - np.log(1e-10 * 1 * 1 * 1) / 4 + assert all_scores[3] == pytest.approx( + np.exp(np.log(1e-10 * 1 * 1 * 1) / 4) ) - assert 
np.sum(masked_per_aa_scores.numpy()[0]) == 4 - assert np.sum(masked_per_aa_scores.numpy()[1]) == 3 - assert np.sum(masked_per_aa_scores.numpy()[2]) == 3 - assert np.sum(masked_per_aa_scores.numpy()[3]) == 3 + aa_scores = np.array([1, 1, 1, 1]) + assert np.allclose(masked_per_aa_scores[0], (aa_scores + 1) / 2) + aa_scores = np.array([1, 1, 1]) + assert np.allclose(masked_per_aa_scores[1], (aa_scores + 1) / 2) + aa_scores = np.array([0.5, 0.5, 1, 1]) + assert np.allclose( + masked_per_aa_scores[2], + (aa_scores + np.exp(np.log(0.5 * 0.5 * 1 * 1) / 4)) / 2, + ) + aa_scores = np.array([1e-10, 1, 1, 1]) + assert np.allclose( + masked_per_aa_scores[3], + (aa_scores + np.exp(np.log(1e-10 * 1 * 1 * 1) / 4)) / 2, + ) def test_digest_fasta_cleave(tiny_fasta_file, residues_dict): From 092fa2a6da155c93898c8581a1bc6de7c72a9827 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Sun, 3 Nov 2024 12:45:35 -0800 Subject: [PATCH 57/84] line length fixes --- casanovo/casanovo.py | 5 +++-- casanovo/denovo/model.py | 4 ++-- tests/conftest.py | 8 ++++++-- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 5547a807..01098255 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -62,8 +62,9 @@ def __init__(self, *args, **kwargs) -> None: click.Option( ("-m", "--model"), help=""" - Either the model weights (.ckpt file) or a URL pointing to the model weights - file. If not provided, Casanovo will try to download the latest release automatically. + Either the model weights (.ckpt file) or a URL pointing to + the model weights file. If not provided, + Casanovo will try to download the latest release automatically. """, ), click.Option( diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 40d0cc3d..5e807153 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1021,8 +1021,8 @@ def predict_step(self, batch, *args): predictions: List[Tuple[List[str], int, float, str, np.ndarray, np.ndarray, str]] Model predictions for the given batch of spectra containing spectrum ids, precursor charge and m/z, candidate peptide sequences, peptide - scores, amino acid-level scores, and associated proteins. Stored separately by - spectrum id. + scores, amino acid-level scores, and associated proteins. + Stored separately by spectrum id. 
""" store_dict = collections.defaultdict(list) for start_idx in range(0, len(batch[0]), self.psm_batch_size): diff --git a/tests/conftest.py b/tests/conftest.py index 1729dcb3..009c0737 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -22,14 +22,18 @@ def tiny_fasta_file(tmp_path): fasta_file = tmp_path / "tiny_fasta.fasta" with fasta_file.open("w+") as fasta_ref: fasta_ref.write( - ">foo\nMEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGERVTLSCRASQSVSSSYLTWYQQKPGQAPRLLIYGASTRATSIPARFSGSGSGTDFTLTISSLQPEDFAVYYCQQDYNLP" + ( + ">foo\nMEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGERVTLSCRASQSVSSSYLTWYQ" + "QKPGQAPRLLIYGASTRATSIPARFSGSGSGTDFTLTISSLQPEDFAVYYCQQDYNLP" + ) ) return fasta_file @pytest.fixture def mgf_medium(tmp_path): - """An MGF file with 7 spectra and scan numbers, C+57.021 mass modification considered""" + """An MGF file with 7 spectra and scan numbers, + C+57.021 mass modification considered""" peptides = [ "ATSIPAR", "VTLSCR", From 6d0868c6141f3d329783139929c2db63928875e9 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Sun, 10 Nov 2024 13:42:57 +0100 Subject: [PATCH 58/84] Minor refactoring and type hint fixes --- casanovo/data/db_utils.py | 130 +++++++++++++++++++------------------- 1 file changed, 66 insertions(+), 64 deletions(-) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index e704907c..b1121780 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -6,13 +6,14 @@ import re import string from collections import defaultdict -from typing import List, Tuple, Iterator +from typing import DefaultDict, Dict, Iterator, Pattern, Set, Tuple import depthcharge.masses +import numba as nb +import numpy as np import pandas as pd -import pyteomics.fasta as fasta -import pyteomics.parser as parser -from numba import njit +import pyteomics.fasta +import pyteomics.parser logger = logging.getLogger("casanovo") @@ -24,8 +25,8 @@ class ProteinDatabase: """ - Store digested .fasta data and return candidate peptides - for a given precursor mass. + Store digested FASTA data and return candidate peptides for a given + precursor mass. Parameters ---------- @@ -36,7 +37,7 @@ class ProteinDatabase: See pyteomics.parser.expasy_rules for valid enzymes. digestion : str The type of digestion to perform. - Either 'full', 'partial' or 'non-specific'. + Either 'full', 'partial', or 'non-specific'. missed_cleavages : int The number of missed cleavages to allow. min_peptide_len : int @@ -51,10 +52,10 @@ class ProteinDatabase: Isotope range [min, max] to consider when comparing predicted and observed precursor m/z's. allowed_fixed_mods : str - A comma separated string of fixed modifications to consider. + A comma-separated string of fixed modifications to consider. allowed_var_mods : str - A comma separated string of variable modifications to consider. - residues : dict[str, float] + A comma-separated string of variable modifications to consider. + residues : Dict[str, float] A dictionary of amino acid masses. """ @@ -71,7 +72,7 @@ def __init__( isotope_error: Tuple[int, int], allowed_fixed_mods: str, allowed_var_mods: str, - residues: dict[str, float], + residues: Dict[str, float], ): self.fixed_mods, self.var_mods, self.swap_map = _construct_mods_dict( allowed_fixed_mods, allowed_var_mods @@ -99,8 +100,8 @@ def get_candidates( charge: int, ) -> pd.Series: """ - Returns a list of candidate peptides that fall within the - specified mass range. + Returns candidate peptides that fall within the search + parameter's precursor mass tolerance. 
Parameters ---------- @@ -141,7 +142,7 @@ def get_candidates( def get_associated_protein(self, peptide: str) -> str: """ - Returns the associated protein for a given peptide. + Returns the associated protein(s) for a given peptide. Parameters ---------- @@ -151,17 +152,17 @@ def get_associated_protein(self, peptide: str) -> str: Returns ------- protein : str - The associated protein(s). + The associated protein(s) identifiers, separated by commas. """ return ",".join(self.prot_map[peptide]) def _digest_fasta( self, peptide_generator: Iterator[Tuple[str, str]], - ) -> Tuple[pd.DataFrame, dict[str, str]]: + ) -> Tuple[pd.DataFrame, DefaultDict[str, Set]]: """ Digests a FASTA file and returns the peptides, their masses, - and associated protein. + and associated protein(s). Parameters ---------- @@ -173,14 +174,14 @@ def _digest_fasta( pep_table : pd.DataFrame A Pandas DataFrame with peptide and mass columns. Sorted by neutral mass in ascending order. - prot_map : dict[str, str] + prot_map : DefaultDict[str, Set] A dictionary mapping peptides to associated proteins. """ - # Generate modified peptides + # Generate all possible peptide isoforms. mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") peptide_isoforms = [ ( - parser.isoforms( + pyteomics.parser.isoforms( pep, variable_mods=self.var_mods, fixed_mods=self.fixed_mods, @@ -203,12 +204,13 @@ def _digest_fasta( ) ] - # Create a dictionary mapping for easy accession of associated proteins - prot_map = defaultdict(set) + # Create a dictionary mapping for easy accession of associated + # proteins. + prot_map: DefaultDict[str, Set] = defaultdict(set) for pep, _, prot in mod_peptide_list: prot_map[pep].add(prot) - # Create a DataFrame for easy sorting and filtering + # Create a DataFrame for easy sorting and filtering. pep_table = pd.DataFrame( [(pep, mass) for pep, mass, _ in mod_peptide_list], columns=["peptide", "calc_mass"], @@ -230,11 +232,11 @@ def _peptide_generator( missed_cleavages: int, min_peptide_len: int, max_peptide_len: int, - valid_aa: set[str], -) -> Iterator[str]: + valid_aa: Set[str], +) -> Iterator[Tuple[str, str]]: """ - Create a generator the yields peptides from a FASTA file - depending on the type of digestion specified. + Creates a generator that yields peptides from a FASTA file depending + on the type of digestion specified. Parameters ---------- @@ -246,74 +248,73 @@ def _peptide_generator( Can also be a regex. digestion : str The type of digestion to perform. - Either 'full', 'partial' or 'non-specific'. + Either 'full', 'partial', or 'non-specific'. missed_cleavages : int The number of missed cleavages to allow. min_peptide_len : int The minimum length of peptides to consider. max_peptide_len : int The maximum length of peptides to consider. - valid_aa : set[str] + valid_aa : Set[str] A set of valid amino acids. Yields ------ - pep : str + peptide : str A peptide sequence, unmodified. protein : str The associated protein. """ - # Verify the existence of the file: + # Verify the existence of the file. 
if not os.path.isfile(fasta_filename): logger.error("File %s does not exist.", fasta_filename) raise FileNotFoundError(f"File {fasta_filename} does not exist.") - if digestion not in ["full", "partial", "non-specific"]: + if digestion not in ("full", "partial", "non-specific"): logger.error("Digestion type %s not recognized.", digestion) raise ValueError(f"Digestion type {digestion} not recognized.") - if enzyme not in parser.expasy_rules: + if enzyme not in pyteomics.parser.expasy_rules: logger.info( "Enzyme %s not recognized. Interpreting as cleavage rule.", enzyme, ) if digestion == "non-specific": - for header, seq in fasta.read(fasta_filename): + for header, seq in pyteomics.fasta.read(fasta_filename): protein = header.split()[0] - # Generate all possible peptides + # Generate all possible peptides. for i in range(len(seq)): for j in range( i + min_peptide_len, min(i + max_peptide_len + 1, len(seq) + 1), ): - pep = seq[i:j] - if any(aa not in valid_aa for aa in pep): + peptide = seq[i:j] + if any(aa not in valid_aa for aa in peptide): logger.warning( "Skipping peptide with unknown amino acids: %s", - pep, + peptide, ) else: - yield pep, protein + yield peptide, protein else: - semi = digestion == "partial" - for header, seq in fasta.read(fasta_filename): - pep_set = parser.cleave( + for header, seq in pyteomics.fasta.read(fasta_filename): + peptides = pyteomics.parser.cleave( seq, rule=enzyme, missed_cleavages=missed_cleavages, - semi=semi, + semi=digestion == "partial", ) protein = header.split()[0] - for pep in pep_set: - if len(pep) >= min_peptide_len and len(pep) <= max_peptide_len: - if any(aa not in valid_aa for aa in pep): + for peptide in peptides: + if min_peptide_len <= len(peptide) <= max_peptide_len: + if any(aa not in valid_aa for aa in peptide): logger.warning( "Skipping peptide with unknown amino acids: %s", - pep, + peptide, ) else: - yield pep, protein + yield peptide, protein -@njit +@nb.njit def _to_neutral_mass(mz_mass: float, charge: int) -> float: """ Convert precursor m/z value to neutral mass. @@ -334,7 +335,7 @@ def _to_neutral_mass(mz_mass: float, charge: int) -> float: def _convert_from_modx( - seq: str, swap_map: dict[str, str], swap_regex: str + seq: str, swap_map: dict[str, str], swap_regex: Pattern ) -> str: """ Converts peptide sequence from modX format to @@ -345,50 +346,51 @@ def _convert_from_modx( seq : str Peptide in modX format swap_map : dict[str, str] - Dictionary that allows for swapping of modX to Casanovo-acceptable modifications. - swap_regex : str + Dictionary that allows for swapping of modX to + Casanovo-acceptable modifications. + swap_regex : Pattern Regular expression to match modX format. Returns: -------- - swap_regex : str + str Peptide in Casanovo-acceptable modifications. """ + # FIXME: This might be handled by the DepthCharge residues vocabulary + # instead. return swap_regex.sub(lambda x: swap_map[x.group()], seq) def _construct_mods_dict( allowed_fixed_mods: str, allowed_var_mods: str -) -> Tuple[dict[str, str], dict[str, str], dict[str, str]]: +) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, str]]: """ Constructs dictionaries of fixed and variable modifications. Parameters ---------- allowed_fixed_mods : str - A comma separated string of fixed modifications to consider. + A comma-separated string of fixed modifications to consider. allowed_var_mods : str - A comma separated string of variable modifications to consider. + A comma-separated string of variable modifications to consider. 
Returns ------- - fixed_mods : dict[str, str] + fixed_mods : Dict[str, str] A dictionary of fixed modifications. - var_mods : dict[str, str] + var_mods : Dict[str, str] A dictionary of variable modifications. - swap_map : dict[str, str] + swap_map : Dict[str, str] A dictionary that allows for swapping of modX to Casanovo-acceptable modifications. """ - swap_map = {} - fixed_mods = {} - var_mods = {} + swap_map, fixed_mods, var_mods = {}, {}, {} for mod_map, allowed_mods in zip( [fixed_mods, var_mods], [allowed_fixed_mods, allowed_var_mods] ): - for idx, mod in enumerate(allowed_mods.split(",")): + for i, mod in enumerate(allowed_mods.split(",")): aa, mod_aa = mod.split(":") - mod_id = string.ascii_lowercase[idx] + mod_id = string.ascii_lowercase[i] if aa == "nterm": mod_map[f"{mod_id}-"] = True swap_map[f"{mod_id}-"] = f"{mod_aa}" From 6ea037813fe5ab966e2d6e14497487d45dd7d1b7 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Sun, 10 Nov 2024 13:43:17 +0100 Subject: [PATCH 59/84] Use mask for more efficient candidate filtering --- casanovo/data/db_utils.py | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index b1121780..2e60663b 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -115,30 +115,22 @@ def get_candidates( candidates : pd.Series A series of candidate peptides. """ - candidates = [] - + # FIXME: This could potentially be sped up with only a single pass + # through the database. + mask = np.zeros(len(self.db_peptides), dtype=bool) + precursor_tol_ppm = self.precursor_tolerance / 1e6 for e in range(self.isotope_error[0], self.isotope_error[1] + 1): iso_shift = ISOTOPE_SPACING * e shift_raw_mass = float( _to_neutral_mass(precursor_mz, charge) - iso_shift ) - upper_bound = shift_raw_mass * ( - 1 + (self.precursor_tolerance / 1e6) - ) - lower_bound = shift_raw_mass * ( - 1 - (self.precursor_tolerance / 1e6) - ) - - window = self.db_peptides[ + upper_bound = shift_raw_mass * (1 + precursor_tol_ppm) + lower_bound = shift_raw_mass * (1 - precursor_tol_ppm) + mask |= ( (self.db_peptides["calc_mass"] >= lower_bound) & (self.db_peptides["calc_mass"] <= upper_bound) - ] - candidates.append(window[["peptide", "calc_mass"]]) - - candidates = pd.concat(candidates) - candidates.drop_duplicates(inplace=True) - candidates.sort_values(by=["calc_mass", "peptide"], inplace=True) - return candidates["peptide"] + ) + return self.db_peptides[mask]["peptide"] def get_associated_protein(self, peptide: str) -> str: """ From 408aa4d2359847cc8834ba70aef5b1179bb8ee91 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Sun, 10 Nov 2024 13:46:11 +0100 Subject: [PATCH 60/84] Reorder methods in logical order --- casanovo/data/db_utils.py | 208 +++++++++++++++++++------------------- 1 file changed, 104 insertions(+), 104 deletions(-) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index 2e60663b..024452c8 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -94,60 +94,6 @@ def __init__( self.precursor_tolerance = precursor_tolerance self.isotope_error = isotope_error - def get_candidates( - self, - precursor_mz: float, - charge: int, - ) -> pd.Series: - """ - Returns candidate peptides that fall within the search - parameter's precursor mass tolerance. - - Parameters - ---------- - precursor_mz : float - The precursor mass-to-charge ratio. - charge : int - The precursor charge. 
- - Returns - ------- - candidates : pd.Series - A series of candidate peptides. - """ - # FIXME: This could potentially be sped up with only a single pass - # through the database. - mask = np.zeros(len(self.db_peptides), dtype=bool) - precursor_tol_ppm = self.precursor_tolerance / 1e6 - for e in range(self.isotope_error[0], self.isotope_error[1] + 1): - iso_shift = ISOTOPE_SPACING * e - shift_raw_mass = float( - _to_neutral_mass(precursor_mz, charge) - iso_shift - ) - upper_bound = shift_raw_mass * (1 + precursor_tol_ppm) - lower_bound = shift_raw_mass * (1 - precursor_tol_ppm) - mask |= ( - (self.db_peptides["calc_mass"] >= lower_bound) - & (self.db_peptides["calc_mass"] <= upper_bound) - ) - return self.db_peptides[mask]["peptide"] - - def get_associated_protein(self, peptide: str) -> str: - """ - Returns the associated protein(s) for a given peptide. - - Parameters - ---------- - peptide : str - The peptide sequence. - - Returns - ------- - protein : str - The associated protein(s) identifiers, separated by commas. - """ - return ",".join(self.prot_map[peptide]) - def _digest_fasta( self, peptide_generator: Iterator[Tuple[str, str]], @@ -216,6 +162,100 @@ def _digest_fasta( ) return pep_table, prot_map + def get_candidates( + self, + precursor_mz: float, + charge: int, + ) -> pd.Series: + """ + Returns candidate peptides that fall within the search + parameter's precursor mass tolerance. + + Parameters + ---------- + precursor_mz : float + The precursor mass-to-charge ratio. + charge : int + The precursor charge. + + Returns + ------- + candidates : pd.Series + A series of candidate peptides. + """ + # FIXME: This could potentially be sped up with only a single pass + # through the database. + mask = np.zeros(len(self.db_peptides), dtype=bool) + precursor_tol_ppm = self.precursor_tolerance / 1e6 + for e in range(self.isotope_error[0], self.isotope_error[1] + 1): + iso_shift = ISOTOPE_SPACING * e + shift_raw_mass = float( + _to_neutral_mass(precursor_mz, charge) - iso_shift + ) + upper_bound = shift_raw_mass * (1 + precursor_tol_ppm) + lower_bound = shift_raw_mass * (1 - precursor_tol_ppm) + mask |= ( + (self.db_peptides["calc_mass"] >= lower_bound) + & (self.db_peptides["calc_mass"] <= upper_bound) + ) + return self.db_peptides[mask]["peptide"] + + def get_associated_protein(self, peptide: str) -> str: + """ + Returns the associated protein(s) for a given peptide. + + Parameters + ---------- + peptide : str + The peptide sequence. + + Returns + ------- + protein : str + The associated protein(s) identifiers, separated by commas. + """ + return ",".join(self.prot_map[peptide]) + + +def _construct_mods_dict( + allowed_fixed_mods: str, allowed_var_mods: str +) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, str]]: + """ + Constructs dictionaries of fixed and variable modifications. + + Parameters + ---------- + allowed_fixed_mods : str + A comma-separated string of fixed modifications to consider. + allowed_var_mods : str + A comma-separated string of variable modifications to consider. + + Returns + ------- + fixed_mods : Dict[str, str] + A dictionary of fixed modifications. + var_mods : Dict[str, str] + A dictionary of variable modifications. + swap_map : Dict[str, str] + A dictionary that allows for swapping of modX to + Casanovo-acceptable modifications. 
+ """ + swap_map, fixed_mods, var_mods = {}, {}, {} + for mod_map, allowed_mods in zip( + [fixed_mods, var_mods], [allowed_fixed_mods, allowed_var_mods] + ): + for i, mod in enumerate(allowed_mods.split(",")): + aa, mod_aa = mod.split(":") + mod_id = string.ascii_lowercase[i] + if aa == "nterm": + mod_map[f"{mod_id}-"] = True + swap_map[f"{mod_id}-"] = f"{mod_aa}" + else: + mod_map[mod_id] = [aa] + swap_map[f"{mod_id}{aa}"] = f"{mod_aa}" + + return fixed_mods, var_mods, swap_map + def _peptide_generator( fasta_filename: str, @@ -306,26 +346,6 @@ def _peptide_generator( yield peptide, protein -@nb.njit -def _to_neutral_mass(mz_mass: float, charge: int) -> float: - """ - Convert precursor m/z value to neutral mass. - - Parameters - ---------- - mz_mass : float - The precursor mass-to-charge ratio. - charge : int - The precursor charge. - - Returns - ------- - mass : float - The calculated precursor neutral mass. - """ - return charge * (mz_mass - PROTON) - - def _convert_from_modx( seq: str, swap_map: dict[str, str], swap_regex: Pattern ) -> str: @@ -353,41 +373,21 @@ def _convert_from_modx( return swap_regex.sub(lambda x: swap_map[x.group()], seq) -def _construct_mods_dict( - allowed_fixed_mods: str, allowed_var_mods: str -) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, str]]: +@nb.njit +def _to_neutral_mass(mz_mass: float, charge: int) -> float: """ - Constructs dictionaries of fixed and variable modifications. + Convert precursor m/z value to neutral mass. Parameters ---------- - allowed_fixed_mods : str - A comma-separated string of fixed modifications to consider. - allowed_var_mods : str - A comma-separated string of variable modifications to consider. + mz_mass : float + The precursor mass-to-charge ratio. + charge : int + The precursor charge. Returns ------- - fixed_mods : Dict[str, str] - A dictionary of fixed modifications. - var_mods : Dict[str, str] - A dictionary of variable modifications. - swap_map : Dict[str, str] - A dictionary that allows for swapping of modX to - Casanovo-acceptable modifications. + mass : float + The calculated precursor neutral mass. 
""" - swap_map, fixed_mods, var_mods = {}, {}, {} - for mod_map, allowed_mods in zip( - [fixed_mods, var_mods], [allowed_fixed_mods, allowed_var_mods] - ): - for i, mod in enumerate(allowed_mods.split(",")): - aa, mod_aa = mod.split(":") - mod_id = string.ascii_lowercase[i] - if aa == "nterm": - mod_map[f"{mod_id}-"] = True - swap_map[f"{mod_id}-"] = f"{mod_aa}" - else: - mod_map[mod_id] = [aa] - swap_map[f"{mod_id}{aa}"] = f"{mod_aa}" - - return fixed_mods, var_mods, swap_map + return charge * (mz_mass - PROTON) From 65189ee142ea686d1f148da000a684a88bacdbea Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Sun, 10 Nov 2024 13:50:50 +0100 Subject: [PATCH 61/84] Fix unit tests --- tests/unit_tests/test_unit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 034f4874..a863b1f7 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -699,10 +699,10 @@ def test_digest_fasta_mods(tiny_fasta_file, residues_dict): "+43.006ASQSVSSSYLTWYQQKPGQAPR", "-17.027FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + "FSGSGSGTDFTLTISSLQ+0.984PEDFAVYYC+57.021QQDYNLP", "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021Q+0.984QDYNLP", "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQ+0.984DYNLP", "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYN+0.984LP", - "FSGSGSGTDFTLTISSLQ+0.984PEDFAVYYC+57.021QQDYNLP", "+43.006-17.027FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", "+42.011FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", "+43.006FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", @@ -873,8 +873,8 @@ def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): "QQKPGQ", "SLQPED", "PAQLLF", - "SIPARF", "IPARFS", + "SIPARF", "LSC+57.021RAS", "TDFTLT", "QAPRLL", From 1efd9dda358d1b84df9028bb2fa2654466ff8c53 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Sun, 10 Nov 2024 14:11:38 +0100 Subject: [PATCH 62/84] Directly generate DB peptides as DataFrame --- casanovo/data/db_utils.py | 81 ++++++++++++++++------------------- tests/unit_tests/test_unit.py | 29 +++++-------- 2 files changed, 46 insertions(+), 64 deletions(-) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index 024452c8..55127cff 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -5,8 +5,7 @@ import os import re import string -from collections import defaultdict -from typing import DefaultDict, Dict, Iterator, Pattern, Set, Tuple +from typing import Dict, Iterator, Pattern, Set, Tuple import depthcharge.masses import numba as nb @@ -90,14 +89,14 @@ def __init__( max_peptide_len, set([aa[0] for aa in residues.keys() if aa[0].isalpha()]), ) - self.db_peptides, self.prot_map = self._digest_fasta(peptide_generator) + self.db_peptides = self._digest_fasta(peptide_generator) self.precursor_tolerance = precursor_tolerance self.isotope_error = isotope_error def _digest_fasta( self, peptide_generator: Iterator[Tuple[str, str]], - ) -> Tuple[pd.DataFrame, DefaultDict[str, Set]]: + ) -> pd.DataFrame: """ Digests a FASTA file and returns the peptides, their masses, and associated protein(s). @@ -109,58 +108,50 @@ def _digest_fasta( Returns ------- - pep_table : pd.DataFrame - A Pandas DataFrame with peptide and mass columns. - Sorted by neutral mass in ascending order. - prot_map : DefaultDict[str, Set] - A dictionary mapping peptides to associated proteins. 
+ peptides : pd.DataFrame + A Pandas DataFrame with index "peptide" (the peptide + sequence), and columns "calc_mass" (the peptide neutral + mass) and "protein" (a list of associated protein(s)). """ # Generate all possible peptide isoforms. - mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") - peptide_isoforms = [ - ( - pyteomics.parser.isoforms( + peptides = pd.DataFrame( + data=[ + (iso, prot) + for pep, prot in peptide_generator + for iso in pyteomics.parser.isoforms( pep, variable_mods=self.var_mods, fixed_mods=self.fixed_mods, max_mods=self.max_mods, - ), - prot, - ) - for pep, prot in peptide_generator - ] - mod_peptide_list = [ - (mod_pep, mass_calculator.mass(mod_pep), prot) - for isos, prot in peptide_isoforms - for mod_pep in map( - functools.partial( - _convert_from_modx, - swap_map=self.swap_map, - swap_regex=self.swap_regex, - ), - isos, + ) + ], + columns=["peptide", "protein"], + ) + # Convert modX peptide to Casanovo format. + peptides["peptide"] = peptides["peptide"].apply( + functools.partial( + _convert_from_modx, + swap_map=self.swap_map, + swap_regex=self.swap_regex, ) - ] - - # Create a dictionary mapping for easy accession of associated - # proteins. - prot_map: DefaultDict[str, Set] = defaultdict(set) - for pep, _, prot in mod_peptide_list: - prot_map[pep].add(prot) - - # Create a DataFrame for easy sorting and filtering. - pep_table = pd.DataFrame( - [(pep, mass) for pep, mass, _ in mod_peptide_list], - columns=["peptide", "calc_mass"], ) - pep_table.sort_values( + # Merge proteins from duplicate peptides. + peptides = peptides.groupby("peptide")["protein"].apply( + lambda proteins: sorted(set(proteins)) + ).reset_index() + # Calculate the mass of each peptide. + mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") + peptides["calc_mass"] = peptides["peptide"].apply(mass_calculator.mass) + # Sort by peptide mass and index by peptide sequence. + peptides.sort_values( by=["calc_mass", "peptide"], ascending=True, inplace=True ) + peptides.set_index("peptide", inplace=True) logger.info( - "Digestion complete. %d peptides generated.", len(pep_table) + "Digestion complete. %d peptides generated.", len(peptides) ) - return pep_table, prot_map + return peptides def get_candidates( self, @@ -198,7 +189,7 @@ def get_candidates( (self.db_peptides["calc_mass"] >= lower_bound) & (self.db_peptides["calc_mass"] <= upper_bound) ) - return self.db_peptides[mask]["peptide"] + return self.db_peptides.index[mask] def get_associated_protein(self, peptide: str) -> str: """ @@ -214,7 +205,7 @@ def get_associated_protein(self, peptide: str) -> str: protein : str The associated protein(s) identifiers, separated by commas. 
""" - return ",".join(self.prot_map[peptide]) + return ",".join(self.db_peptides.loc[peptide, "protein"]) def _construct_mods_dict( diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index a863b1f7..0d4812f9 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -645,8 +645,7 @@ def test_digest_fasta_cleave(tiny_fasta_file, residues_dict): ), residues=residues_dict, ) - peptide_list = list(pdb.db_peptides["peptide"]) - assert peptide_list == expected + assert pdb.db_peptides.index.to_list() == expected def test_digest_fasta_mods(tiny_fasta_file, residues_dict): @@ -724,8 +723,7 @@ def test_digest_fasta_mods(tiny_fasta_file, residues_dict): ), residues=residues_dict, ) - peptide_list = list(pdb.db_peptides["peptide"]) - assert peptide_list == expected_1mod + assert pdb.db_peptides.index.to_list() == expected_1mod def test_length_restrictions(tiny_fasta_file, residues_dict): @@ -756,8 +754,7 @@ def test_length_restrictions(tiny_fasta_file, residues_dict): ), residues=residues_dict, ) - peptide_list = list(pdb.db_peptides["peptide"]) - assert peptide_list == expected_long + assert pdb.db_peptides.index.to_list() == expected_long pdb = db_utils.ProteinDatabase( fasta_path=str(tiny_fasta_file), @@ -776,8 +773,7 @@ def test_length_restrictions(tiny_fasta_file, residues_dict): ), residues=residues_dict, ) - peptide_list = list(pdb.db_peptides["peptide"]) - assert peptide_list == expected_short + assert pdb.db_peptides.index.to_list() == expected_short def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): @@ -942,8 +938,7 @@ def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): ), residues=residues_dict, ) - peptide_list = list(pdb.db_peptides["peptide"]) - assert peptide_list == expected_argc + assert pdb.db_peptides.index.to_list() == expected_argc pdb = db_utils.ProteinDatabase( fasta_path=str(tiny_fasta_file), @@ -962,8 +957,7 @@ def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): ), residues=residues_dict, ) - peptide_list = list(pdb.db_peptides["peptide"]) - assert peptide_list == expected_aspn + assert pdb.db_peptides.index.to_list() == expected_aspn # Test regex rule instead of named enzyme pdb = db_utils.ProteinDatabase( @@ -983,8 +977,7 @@ def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): ), residues=residues_dict, ) - peptide_list = list(pdb.db_peptides["peptide"]) - assert peptide_list == expected_argc + assert pdb.db_peptides.index.to_list() == expected_argc # Test semispecific digest pdb = db_utils.ProteinDatabase( @@ -1004,8 +997,7 @@ def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): ), residues=residues_dict, ) - peptide_list = list(pdb.db_peptides["peptide"]) - assert peptide_list == expected_semispecific + assert pdb.db_peptides.index.to_list() == expected_semispecific # Test nonspecific digest pdb = db_utils.ProteinDatabase( @@ -1025,8 +1017,7 @@ def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): ), residues=residues_dict, ) - peptide_list = list(pdb.db_peptides["peptide"]) - assert peptide_list == expected_nonspecific + assert pdb.db_peptides.index.to_list() == expected_nonspecific def test_get_candidates(tiny_fasta_file, residues_dict): @@ -1139,7 +1130,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file, residues_dict): peptide_list = pd.DataFrame( peptide_list, columns=["peptide", "calc_mass", "protein"] - ) + ).set_index("peptide") peptide_list.sort_values("calc_mass", inplace=True) expected_isotope0 = list("UTSRQPONMLKJIHGFEDCB") From 
f679cdc595d6a0a65e0a6f8cdefebc0909fe391f Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Sun, 10 Nov 2024 14:23:41 +0100 Subject: [PATCH 63/84] Fix type hints and line lengths --- casanovo/casanovo.py | 142 +++++++++++++++++++++++-------------------- casanovo/utils.py | 14 ++--- 2 files changed, 82 insertions(+), 74 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 01098255..17786793 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -12,7 +12,7 @@ import urllib.parse import warnings from pathlib import Path -from typing import Optional, Tuple, List +from typing import Optional, Tuple warnings.formatwarning = lambda message, category, *args, **kwargs: ( f"{category.__name__}: {message}" @@ -62,19 +62,19 @@ def __init__(self, *args, **kwargs) -> None: click.Option( ("-m", "--model"), help=""" - Either the model weights (.ckpt file) or a URL pointing to - the model weights file. If not provided, - Casanovo will try to download the latest release automatically. + Either the model weights (.ckpt file) or a URL pointing to the + model weights file. If not provided, Casanovo will try to + download the latest release automatically. """, ), click.Option( ("-d", "--output_dir"), - help="The destination directory for output files", + help="The destination directory for output files.", type=click.Path(dir_okay=True), ), click.Option( ("-o", "--output_root"), - help="The root name for all output files", + help="The root name for all output files.", type=click.Path(dir_okay=False), ), click.Option( @@ -113,9 +113,9 @@ def main() -> None: ======== Casanovo de novo sequences peptides from tandem mass spectra using a - Transformer model. Casanovo currently supports mzML, mzXML, and MGF files - for de novo sequencing and annotated MGF files, such as those from - MassIVE-KB, for training new models. + Transformer model. Casanovo currently supports mzML, mzXML, and MGF + files for de novo sequencing and annotated MGF files, such as those + from MassIVE-KB, for training new models. Links: @@ -124,10 +124,10 @@ def main() -> None: If you use Casanovo in your work, please cite: - - Yilmaz, M., Fondrie, W. E., Bittremieux, W., Oh, S. & Noble, W. S. De novo - mass spectrometry peptide sequencing with a transformer model. Proceedings - of the 39th International Conference on Machine Learning - ICML '22 (2022) - doi:10.1101/2022.02.07.479481. + - Yilmaz, M., Fondrie, W. E., Bittremieux, W., Oh, S. & Noble, W. S. + De novo mass spectrometry peptide sequencing with a transformer + model. Proceedings of the 39th International Conference on Machine + Learning - ICML '22 (2022) doi:10.1101/2022.02.07.479481. """ @@ -147,9 +147,9 @@ def main() -> None: is_flag=True, default=False, help=""" - Run in evaluation mode. When this flag is set the peptide and amino - acid precision will be calculated and logged at the end of the sequencing - run. All input files must be annotated MGF files if running in evaluation + Run in evaluation mode. When this flag is set the peptide and amino acid + precision will be calculated and logged at the end of the sequencing run. + All input files must be annotated MGF files if running in evaluation mode. """, ) @@ -290,8 +290,9 @@ def train( ) -> None: """Train a Casanovo model on your own data. - TRAIN_PEAK_PATH must be one or more annoated MGF files, such as those - provided by MassIVE-KB, from which to train a new Casnovo model. 
+ TRAIN_PEAK_PATH must be one or more annoated MGF files, such as + those provided by MassIVE-KB, from which to train a new Casnovo + model. """ output_path, output_root_name = _setup_output( output_dir, output_root, force_overwrite, verbosity @@ -324,7 +325,7 @@ def train( @main.command() def version() -> None: - """Get the Casanovo version information""" + """Get the Casanovo version information.""" versions = [ f"Casanovo: {__version__}", f"Depthcharge: {depthcharge.__version__}", @@ -342,20 +343,20 @@ def version() -> None: default="casanovo.yaml", type=click.Path(dir_okay=False), ) -def configure(output: str) -> None: +def configure(output: Path) -> None: """Generate a Casanovo configuration file to customize. The casanovo configuration file is in the YAML format. """ - Config.copy_default(output) - output = setup_logging(output, "info") + Config.copy_default(str(output)) + setup_logging(output, "info") logger.info(f"Wrote {output}\n") def setup_logging( log_file_path: Path, verbosity: str, -) -> Path: +) -> None: """Set up the logger. Logging occurs to the command-line and to the given log file. @@ -423,10 +424,11 @@ def setup_model( Parameters ---------- model : str | None - May be a file system path, a URL pointing to a .ckpt file, or None. - If `model` is a URL the weights will be downloaded and cached from - `model`. If `model` is `None` the weights from the latest matching - official release will be used (downloaded and cached). + May be a file system path, a URL pointing to a .ckpt file, or + None. If `model` is a URL the weights will be downloaded and + cached from `model`. If `model` is `None` the weights from the + latest matching official release will be used (downloaded and + cached). config : str | None Config file path. If None the default config will be used. output_dir: : Path | str @@ -434,20 +436,21 @@ def setup_model( output_root_name : str, The base name for the output files. is_train : bool - Are we training? If not, we need to retrieve weights when the model is - None. + Are we training? If not, we need to retrieve weights when the + model is None. Return ------ Tuple[Config, Path] - Initialized Casanovo config, local path to model weights if any (may be - `None` if training using random starting weights). + Initialized Casanovo config, local path to model weights if any + (may be `None` if training using random starting weights). """ # Read parameters from the config file. config = Config(config) seed_everything(seed=config["random_seed"], workers=True) - # Download model weights if these were not specified (except when training). + # Download model weights if these were not specified (except when + # training). cache_dir = Path(appdirs.user_cache_dir("casanovo", False, opinion=False)) if model is None: if not is_train: @@ -455,16 +458,16 @@ def setup_model( model = _get_model_weights(cache_dir) except github.RateLimitExceededException: logger.error( - "GitHub API rate limit exceeded while trying to download the " - "model weights. Please download compatible model weights " - "manually from the official Casanovo code website " - "(https://github.com/Noble-Lab/casanovo) and specify these " - "explicitly using the `--model` parameter when running " - "Casanovo." + "GitHub API rate limit exceeded while trying to download " + "the model weights. 
Please download compatible model " + "weights manually from the official Casanovo code website " + "(https://github.com/Noble-Lab/casanovo) and specify " + "these explicitly using the `--model` parameter when " + "running Casanovo." ) raise PermissionError( - "GitHub API rate limit exceeded while trying to download the " - "model weights" + "GitHub API rate limit exceeded while trying to download " + "the model weights" ) from None else: if _is_valid_url(model): @@ -489,29 +492,30 @@ def setup_model( return config, model -def _get_model_weights(cache_dir: Path) -> str: +def _get_model_weights(cache_dir: Path) -> Path: """ Use cached model weights or download them from GitHub. - If no weights file (extension: .ckpt) is available in the cache directory, - it will be downloaded from a release asset on GitHub. - Model weights are retrieved by matching release version. If no model weights - for an identical release (major, minor, patch), alternative releases with - matching (i) major and minor, or (ii) major versions will be used. - If no matching release can be found, no model weights will be downloaded. + If no weights file (extension: .ckpt) is available in the cache + directory, it will be downloaded from a release asset on GitHub. + Model weights are retrieved by matching release version. If no model + weights for an identical release (major, minor, patch), alternative + releases with matching (i) major and minor, or (ii) major versions + will be used. If no matching release can be found, no model weights + will be downloaded. - Note that the GitHub API is limited to 60 requests from the same IP per - hour. + Note that the GitHub API is limited to 60 requests from the same IP + per hour. Parameters ---------- cache_dir : Path - model weights cache directory path + Model weights cache directory path. Returns ------- - str - The name of the model weights file. + Path + The path of the model weights file. """ os.makedirs(cache_dir, exist_ok=True) version = utils.split_version(__version__) @@ -598,11 +602,11 @@ def _setup_output( Parameters: ----------- output_dir : str | None - The path to the output directory. If `None`, the output directory will - be resolved to the current working directory. + The path to the output directory. If `None`, the output + directory will be resolved to the current working directory. output_root : str | None - The base name for the output files. If `None` the output root name will - be resolved to casanovo_ + The base name for the output files. If `None` the output root + name will be resolved to casanovo_ overwrite: bool Whether to overwrite log file if it already exists in the output directory. @@ -612,8 +616,8 @@ def _setup_output( Returns: -------- Tuple[Path, str] - A tuple containing the resolved output directory and root name for - output files. + A tuple containing the resolved output directory and root name + for output files. """ if output_root is None: output_root = ( @@ -627,7 +631,8 @@ def _setup_output( if not output_path.is_dir(): output_path.mkdir(parents=True) logger.warning( - "Target output directory %s does not exists, so it will be created.", + "Target output directory %s does not exists, so it will be " + "created.", output_path, ) @@ -647,8 +652,8 @@ def _get_weights_from_url( Resolve weight file from URL Attempt to download weight file from URL if weights are not already - cached - otherwise use cached weights. Downloaded weight files will be - cached. + cached - otherwise use cached weights. Downloaded weight files will + be cached. 
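The cache-or-download policy described here can be pictured with a short stand-alone sketch: reuse the cached file unless it is missing, the remote copy reports a newer Last-Modified time, or a re-download is forced, and fall back to the cached copy when the remote cannot be reached. This is a simplified illustration, not the project's helper; the URL and cache path are placeholders.

from email.utils import parsedate_to_datetime
from pathlib import Path

import requests

def cached_or_download(file_url: str, cache_path: Path, force_download: bool = False) -> Path:
    """Reuse the cached weights file unless it is missing, stale, or a download is forced."""
    stale = force_download or not cache_path.is_file()
    if not stale:
        try:
            response = requests.head(file_url, timeout=10, allow_redirects=True)
            last_modified = response.headers.get("Last-Modified")
            if response.ok and last_modified is not None:
                remote_time = parsedate_to_datetime(last_modified).timestamp()
                stale = remote_time > cache_path.stat().st_mtime
        except requests.ConnectionError:
            pass  # Remote unreachable: keep using the cached copy.
    if stale:
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        with requests.get(file_url, stream=True, timeout=10) as response:
            with cache_path.open("wb") as out:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    out.write(chunk)
    return cache_path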
Parameters ---------- @@ -657,8 +662,8 @@ def _get_weights_from_url( cache_dir : Path Model weights cache directory path. force_download : Optional[bool], default=False - If True, forces a new download of the weight file even if it exists in - the cache. + If True, forces a new download of the weight file even if it + exists in the cache. Returns ------- @@ -688,7 +693,8 @@ def _get_weights_from_url( ).timestamp() else: logger.warning( - "Attempted HEAD request to %s yielded non-ok status code - using cached file", + "Attempted HEAD request to %s yielded non-ok status code—" + "using cached file", file_url, ) except ( @@ -697,7 +703,8 @@ def _get_weights_from_url( requests.TooManyRedirects, ): logger.warning( - "Failed to reach %s to get remote last modified time - using cached file", + "Failed to reach %s to get remote last modified time—using " + "cached file", file_url, ) @@ -715,8 +722,9 @@ def _download_weights(file_url: str, download_path: Path) -> None: """ Download weights file from URL - Download the model weights file from the specified URL and save it to the - given path. Ensures the download directory exists, and uses a progress + Download the model weights file from the specified URL and save it + to the given path. Ensures the download directory exists, and uses a + progress bar to indicate download status. Parameters diff --git a/casanovo/utils.py b/casanovo/utils.py index 43b1cb7d..6e4273e3 100644 --- a/casanovo/utils.py +++ b/casanovo/utils.py @@ -161,16 +161,16 @@ def get_report_dict( def log_run_report( - start_time: Optional[int] = None, end_time: Optional[int] = None + start_time: Optional[float] = None, end_time: Optional[float] = None ) -> None: """ Log general run report Parameters ---------- - start_time : Optional[int], default=None + start_time : Optional[float], default=None The start time of the sequencing run in seconds since the epoch. - end_time : Optional[int], default=None + end_time : Optional[float], default=None The end time of the sequencing run in seconds since the epoch. """ logger.info("======= End of Run Report =======") @@ -197,8 +197,8 @@ def log_run_report( def log_sequencing_report( predictions: List[PepSpecMatch], - start_time: Optional[int] = None, - end_time: Optional[int] = None, + start_time: Optional[float] = None, + end_time: Optional[float] = None, score_bins: List[float] = SCORE_BINS, ) -> None: """ @@ -210,9 +210,9 @@ def log_sequencing_report( str, Tuple[str, str], float, float, float, float, str ] PSM predictions - start_time : Optional[int], default=None + start_time : Optional[float], default=None The start time of the sequencing run in seconds since the epoch. - end_time : Optional[int], default=None + end_time : Optional[float], default=None The end time of the sequencing run in seconds since the epoch. 
score_bins: List[float], Optional Confidence scores for creating confidence score distribution, From c07ef5773eb238a94789545ca8188496cd2787a2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sun, 10 Nov 2024 13:28:11 +0000 Subject: [PATCH 64/84] Generate new screengrabs with rich-codex --- docs/images/help.svg | 162 ++++++++++++------------- docs/images/sequence-help.svg | 216 +++++++++++++++++----------------- docs/images/train-help.svg | 204 ++++++++++++++++---------------- 3 files changed, 291 insertions(+), 291 deletions(-) diff --git a/docs/images/help.svg b/docs/images/help.svg index 5418b95a..d25376e4 100644 --- a/docs/images/help.svg +++ b/docs/images/help.svg @@ -19,132 +19,132 @@ font-weight: 700; } - .terminal-782331977-matrix { + .terminal-1140158551-matrix { font-family: Fira Code, monospace; font-size: 20px; line-height: 24.4px; font-variant-east-asian: full-width; } - .terminal-782331977-title { + .terminal-1140158551-title { font-size: 18px; font-weight: bold; font-family: arial; } - .terminal-782331977-r1 { fill: #c5c8c6 } -.terminal-782331977-r2 { fill: #d0b344 } -.terminal-782331977-r3 { fill: #c5c8c6;font-weight: bold } -.terminal-782331977-r4 { fill: #68a0b3;font-weight: bold } -.terminal-782331977-r5 { fill: #d0b344;font-weight: bold } -.terminal-782331977-r6 { fill: #868887 } -.terminal-782331977-r7 { fill: #98a84b;font-weight: bold } + .terminal-1140158551-r1 { fill: #c5c8c6 } +.terminal-1140158551-r2 { fill: #d0b344 } +.terminal-1140158551-r3 { fill: #c5c8c6;font-weight: bold } +.terminal-1140158551-r4 { fill: #68a0b3;font-weight: bold } +.terminal-1140158551-r5 { fill: #d0b344;font-weight: bold } +.terminal-1140158551-r6 { fill: #868887 } +.terminal-1140158551-r7 { fill: #98a84b;font-weight: bold } - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + @@ -156,44 +156,44 @@ - + - - $ casanovo --help - -Usage:casanovo [OPTIONSCOMMAND [ARGS]...                                     - - ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓  - ┃                                  Casanovo                                  ┃  - ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛  - Casanovo de novo sequences peptides from tandem mass spectra using a            - Transformer model. Casanovo currently supports mzML, mzXML, and MGF files for   - de novo sequencing and annotated MGF files, such as those from MassIVE-KB, for  - training new models.                                                            - - Links:                                                                          - - • Documentation: https://casanovo.readthedocs.io                               - • Official code repository: https://github.com/Noble-Lab/casanovo              - - If you use Casanovo in your work, please cite:                                  - - • Yilmaz, M., Fondrie, W. E., Bittremieux, W., Oh, S. & Noble, W. S. De novo   -mass spectrometry peptide sequencing with a transformer model. Proceedings   -of the 39th International Conference on Machine Learning - ICML '22 (2022)   -doi:10.1101/2022.02.07.479481.                                               - -╭─ Options ────────────────────────────────────────────────────────────────────╮ ---help-h    Show this message and exit.                                     
-╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Commands ───────────────────────────────────────────────────────────────────╮ -configure Generate a Casanovo configuration file to customize.               -db-search Perform a database search on MS/MS data using Casanovo-DB.         -sequence  De novo sequence peptides from tandem mass spectra.                -train     Train a Casanovo model on your own data.                           -version   Get the Casanovo version information                               -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo --help + +Usage:casanovo [OPTIONSCOMMAND [ARGS]...                                     + + ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓  + ┃                                  Casanovo                                  ┃  + ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛  + Casanovo de novo sequences peptides from tandem mass spectra using a            + Transformer model. Casanovo currently supports mzML, mzXML, and MGF files for   + de novo sequencing and annotated MGF files, such as those from MassIVE-KB, for  + training new models.                                                            + + Links:                                                                          + + • Documentation: https://casanovo.readthedocs.io                               + • Official code repository: https://github.com/Noble-Lab/casanovo              + + If you use Casanovo in your work, please cite:                                  + + • Yilmaz, M., Fondrie, W. E., Bittremieux, W., Oh, S. & Noble, W. S. De novo   +mass spectrometry peptide sequencing with a transformer model. Proceedings   +of the 39th International Conference on Machine Learning - ICML '22 (2022)   +doi:10.1101/2022.02.07.479481.                                               + +╭─ Options ────────────────────────────────────────────────────────────────────╮ +--help-h    Show this message and exit.                                     +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Commands ───────────────────────────────────────────────────────────────────╮ +configure Generate a Casanovo configuration file to customize.               +db-search Perform a database search on MS/MS data using Casanovo-DB.         +sequence  De novo sequence peptides from tandem mass spectra.                +train     Train a Casanovo model on your own data.                           +version   Get the Casanovo version information.                              
+╰──────────────────────────────────────────────────────────────────────────────╯ + diff --git a/docs/images/sequence-help.svg b/docs/images/sequence-help.svg index ea6ff078..6354851d 100644 --- a/docs/images/sequence-help.svg +++ b/docs/images/sequence-help.svg @@ -19,171 +19,171 @@ font-weight: 700; } - .terminal-3610042700-matrix { + .terminal-3608076648-matrix { font-family: Fira Code, monospace; font-size: 20px; line-height: 24.4px; font-variant-east-asian: full-width; } - .terminal-3610042700-title { + .terminal-3608076648-title { font-size: 18px; font-weight: bold; font-family: arial; } - .terminal-3610042700-r1 { fill: #c5c8c6 } -.terminal-3610042700-r2 { fill: #d0b344 } -.terminal-3610042700-r3 { fill: #c5c8c6;font-weight: bold } -.terminal-3610042700-r4 { fill: #68a0b3;font-weight: bold } -.terminal-3610042700-r5 { fill: #868887 } -.terminal-3610042700-r6 { fill: #cc555a } -.terminal-3610042700-r7 { fill: #d0b344;font-weight: bold } -.terminal-3610042700-r8 { fill: #8a4346 } -.terminal-3610042700-r9 { fill: #98a84b;font-weight: bold } -.terminal-3610042700-r10 { fill: #8d7b39;font-weight: bold } + .terminal-3608076648-r1 { fill: #c5c8c6 } +.terminal-3608076648-r2 { fill: #d0b344 } +.terminal-3608076648-r3 { fill: #c5c8c6;font-weight: bold } +.terminal-3608076648-r4 { fill: #68a0b3;font-weight: bold } +.terminal-3608076648-r5 { fill: #868887 } +.terminal-3608076648-r6 { fill: #cc555a } +.terminal-3608076648-r7 { fill: #d0b344;font-weight: bold } +.terminal-3608076648-r8 { fill: #8a4346 } +.terminal-3608076648-r9 { fill: #98a84b;font-weight: bold } +.terminal-3608076648-r10 { fill: #8d7b39;font-weight: bold } - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + @@ -195,56 +195,56 @@ - + - - $ casanovo sequence --help - -Usage:casanovo sequence [OPTIONSPEAK_PATH...                                 - - De novo sequence peptides from tandem mass spectra.                             - PEAK_PATH must be one or more mzML, mzXML, or MGF files from which to sequence  - peptides. If evaluate is set to True PEAK_PATH must be one or more annotated    - MGF file.                                                                       - -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -*  PEAK_PATH    FILE[required] -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ ---evaluate-e  Run in evaluation mode.     -                                                   When this flag is set the   -                                                   peptide and amino acid      -                                                   precision will be           -                                                   calculated and logged at    -                                                   the end of the sequencing   -                                                   run. All input files must   -                                                   be annotated MGF files if   -                                                   running in evaluation       -                                                   mode.                       
---model-mTEXT                       Either the model weights    -                                                   (.ckpt file) or a URL       -                                                   pointing to the model       -                                                   weights file. If not        -                                                   provided, Casanovo will     -                                                   try to download the latest  -                                                   release automatically.      ---output_dir-dPATH                       The destination directory   -                                                   for output files            ---output_root-oFILE                       The root name for all       -                                                   output files                ---config-cFILE                       The YAML configuration      -                                                   file overriding the         -                                                   default options.            ---verbosity-v[debug|info|warning|error  Set the verbosity of        -]  console logging messages.   -                                                   Log files are always set    -                                                   to 'debug'.                 ---force_overwrite-f  Whether to overwrite        -                                                   output files.               ---help-h  Show this message and       -                                                   exit.                       -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo sequence --help + +Usage:casanovo sequence [OPTIONSPEAK_PATH...                                 + + De novo sequence peptides from tandem mass spectra.                             + PEAK_PATH must be one or more mzML, mzXML, or MGF files from which to sequence  + peptides. If evaluate is set to True PEAK_PATH must be one or more annotated    + MGF file.                                                                       + +╭─ Arguments ──────────────────────────────────────────────────────────────────╮ +*  PEAK_PATH    FILE[required] +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────╮ +--evaluate-e  Run in evaluation mode.     +                                                   When this flag is set the   +                                                   peptide and amino acid      +                                                   precision will be           +                                                   calculated and logged at    +                                                   the end of the sequencing   +                                                   run. All input files must   +                                                   be annotated MGF files if   +                                                   running in evaluation       +                                                   mode.                       +--model-mTEXT                       Either the model weights    +                                                   (.ckpt file) or a URL       +                                                   pointing to the model       +                                                   weights file. 
If not        +                                                   provided, Casanovo will     +                                                   try to download the latest  +                                                   release automatically.      +--output_dir-dPATH                       The destination directory   +                                                   for output files.           +--output_root-oFILE                       The root name for all       +                                                   output files.               +--config-cFILE                       The YAML configuration      +                                                   file overriding the         +                                                   default options.            +--verbosity-v[debug|info|warning|error  Set the verbosity of        +]  console logging messages.   +                                                   Log files are always set    +                                                   to 'debug'.                 +--force_overwrite-f  Whether to overwrite        +                                                   output files.               +--help-h  Show this message and       +                                                   exit.                       +╰──────────────────────────────────────────────────────────────────────────────╯ + diff --git a/docs/images/train-help.svg b/docs/images/train-help.svg index 783a0660..8aab62d4 100644 --- a/docs/images/train-help.svg +++ b/docs/images/train-help.svg @@ -19,162 +19,162 @@ font-weight: 700; } - .terminal-2920970231-matrix { + .terminal-3079567379-matrix { font-family: Fira Code, monospace; font-size: 20px; line-height: 24.4px; font-variant-east-asian: full-width; } - .terminal-2920970231-title { + .terminal-3079567379-title { font-size: 18px; font-weight: bold; font-family: arial; } - .terminal-2920970231-r1 { fill: #c5c8c6 } -.terminal-2920970231-r2 { fill: #d0b344 } -.terminal-2920970231-r3 { fill: #c5c8c6;font-weight: bold } -.terminal-2920970231-r4 { fill: #68a0b3;font-weight: bold } -.terminal-2920970231-r5 { fill: #868887 } -.terminal-2920970231-r6 { fill: #cc555a } -.terminal-2920970231-r7 { fill: #d0b344;font-weight: bold } -.terminal-2920970231-r8 { fill: #8a4346 } -.terminal-2920970231-r9 { fill: #98a84b;font-weight: bold } -.terminal-2920970231-r10 { fill: #8d7b39;font-weight: bold } + .terminal-3079567379-r1 { fill: #c5c8c6 } +.terminal-3079567379-r2 { fill: #d0b344 } +.terminal-3079567379-r3 { fill: #c5c8c6;font-weight: bold } +.terminal-3079567379-r4 { fill: #68a0b3;font-weight: bold } +.terminal-3079567379-r5 { fill: #868887 } +.terminal-3079567379-r6 { fill: #cc555a } +.terminal-3079567379-r7 { fill: #d0b344;font-weight: bold } +.terminal-3079567379-r8 { fill: #8a4346 } +.terminal-3079567379-r9 { fill: #98a84b;font-weight: bold } +.terminal-3079567379-r10 { fill: #8d7b39;font-weight: bold } - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + @@ -186,53 +186,53 @@ - + - - $ casanovo train --help - -Usage:casanovo train [OPTIONSTRAIN_PEAK_PATH...                              - - Train a Casanovo model on your own data.                                        - TRAIN_PEAK_PATH must be one or more annoated MGF files, such as those provided  - by MassIVE-KB, from which to train a new Casnovo model.                         
- -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -*  TRAIN_PEAK_PATH    FILE[required] -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ ---validation_peak_path-pFILE                    An annotated MGF file     -                                                     for validation, like      -                                                     from MassIVE-KB. Use      -                                                     this option multiple      -                                                     times to specify          -                                                     multiple files.           ---model-mTEXT                    Either the model weights  -                                                     (.ckpt file) or a URL     -                                                     pointing to the model     -                                                     weights file. If not      -                                                     provided, Casanovo will   -                                                     try to download the       -                                                     latest release            -                                                     automatically.            ---output_dir-dPATH                    The destination           -                                                     directory for output      -                                                     files                     ---output_root-oFILE                    The root name for all     -                                                     output files              ---config-cFILE                    The YAML configuration    -                                                     file overriding the       -                                                     default options.          ---verbosity-v[debug|info|warning|er  Set the verbosity of      -ror]  console logging           -                                                     messages. Log files are   -                                                     always set to 'debug'.    ---force_overwrite-f  Whether to overwrite      -                                                     output files.             ---help-h  Show this message and     -                                                     exit.                     -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo train --help + +Usage:casanovo train [OPTIONSTRAIN_PEAK_PATH...                              + + Train a Casanovo model on your own data.                                        + TRAIN_PEAK_PATH must be one or more annoated MGF files, such as those provided  + by MassIVE-KB, from which to train a new Casnovo model.                         + +╭─ Arguments ──────────────────────────────────────────────────────────────────╮ +*  TRAIN_PEAK_PATH    FILE[required] +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────╮ +--validation_peak_path-pFILE                    An annotated MGF file     +                                                     for validation, like      +                                                     from MassIVE-KB. 
Use      +                                                     this option multiple      +                                                     times to specify          +                                                     multiple files.           +--model-mTEXT                    Either the model weights  +                                                     (.ckpt file) or a URL     +                                                     pointing to the model     +                                                     weights file. If not      +                                                     provided, Casanovo will   +                                                     try to download the       +                                                     latest release            +                                                     automatically.            +--output_dir-dPATH                    The destination           +                                                     directory for output      +                                                     files.                    +--output_root-oFILE                    The root name for all     +                                                     output files.             +--config-cFILE                    The YAML configuration    +                                                     file overriding the       +                                                     default options.          +--verbosity-v[debug|info|warning|er  Set the verbosity of      +ror]  console logging           +                                                     messages. Log files are   +                                                     always set to 'debug'.    +--force_overwrite-f  Whether to overwrite      +                                                     output files.             +--help-h  Show this message and     +                                                     exit.                     +╰──────────────────────────────────────────────────────────────────────────────╯ + From 09ffdfb6ebe5dba1b9b52c1b6c6462a2a76a8753 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Sun, 10 Nov 2024 14:52:30 +0100 Subject: [PATCH 65/84] Refactor batching to avoid code repetition --- casanovo/denovo/dataloaders.py | 150 ++++++++++++++++----------------- 1 file changed, 72 insertions(+), 78 deletions(-) diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 4eb4d2e2..f929b1e0 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -11,10 +11,7 @@ from depthcharge.data import AnnotatedSpectrumIndex from ..data import db_utils -from ..data.datasets import ( - AnnotatedSpectrumDataset, - SpectrumDataset, -) +from ..data.datasets import AnnotatedSpectrumDataset, SpectrumDataset logger = logging.getLogger("casanovo") @@ -37,25 +34,25 @@ class DeNovoDataModule(pl.LightningDataModule): eval_batch_size : int The batch size to use for inference. n_peaks : Optional[int] - The number of top-n most intense peaks to keep in each spectrum. `None` - retains all peaks. + The number of top-n most intense peaks to keep in each spectrum. + `None` retains all peaks. min_mz : float - The minimum m/z to include. The default is 140 m/z, in order to exclude - TMT and iTRAQ reporter ions. + The minimum m/z to include. The default is 140 m/z, in order to + exclude TMT and iTRAQ reporter ions. max_mz : float The maximum m/z to include. 
min_intensity : float - Remove peaks whose intensity is below `min_intensity` percentage of the - base peak intensity. + Remove peaks whose intensity is below `min_intensity` percentage + of the base peak intensity. remove_precursor_tol : float - Remove peaks within the given mass tolerance in Dalton around the - precursor mass. + Remove peaks within the given mass tolerance in Dalton around + the precursor mass. n_workers : int, optional - The number of workers to use for data loading. By default, the number of - available CPU cores on the current machine is used. + The number of workers to use for data loading. By default, the + number of available CPU cores on the current machine is used. random_state : Optional[int] - The NumPy random state. ``None`` leaves mass spectra in the order they - were parsed. + The NumPy random state. ``None`` leaves mass spectra in the + order they were parsed. """ def __init__( @@ -74,12 +71,12 @@ def __init__( random_state: Optional[int] = None, ): super().__init__() - self.train_index = train_index - self.valid_index = valid_index - self.test_index = test_index + self.train_index: Optional[AnnotatedSpectrumIndex] = train_index + self.valid_index: Optional[AnnotatedSpectrumIndex] = valid_index + self.test_index: Optional[AnnotatedSpectrumIndex] = test_index self.train_batch_size = train_batch_size self.eval_batch_size = eval_batch_size - self.n_peaks = n_peaks + self.n_peaks: Optional[int] = n_peaks self.min_mz = min_mz self.max_mz = max_mz self.min_intensity = min_intensity @@ -98,11 +95,11 @@ def setup(self, stage: str = None, annotated: bool = True) -> None: Parameters ---------- stage : str {"fit", "validate", "test"} - The stage indicating which Datasets to prepare. All are prepared by - default. + The stage indicating which Datasets to prepare. All are + prepared by default. annotated: bool - True if peptide sequence annotations are available for the test - data. + True if peptide sequence annotations are available for the + test data. """ if stage in (None, "fit", "validate"): make_dataset = functools.partial( @@ -186,7 +183,7 @@ def predict_dataloader(self) -> torch.utils.data.DataLoader: return self._make_loader(self.test_dataset, self.eval_batch_size) def db_dataloader(self) -> torch.utils.data.DataLoader: - """Get a special dataloader for DB search""" + """Get a special dataloader for DB search.""" return self._make_loader( self.test_dataset, self.eval_batch_size, @@ -202,21 +199,23 @@ def prepare_batch( """ Collate MS/MS spectra into a batch. - The MS/MS spectra will be padded so that they fit nicely as a tensor. - However, the padded elements are ignored during the subsequent steps. + The MS/MS spectra will be padded so that they fit nicely as a + tensor. However, the padded elements are ignored during the + subsequent steps. Parameters ---------- batch : List[Tuple[torch.Tensor, float, int, str]] - A batch of data from an AnnotatedSpectrumDataset, consisting of for each - spectrum (i) a tensor with the m/z and intensity peak values, (ii), the - precursor m/z, (iii) the precursor charge, (iv) the spectrum identifier. + A batch of data from an AnnotatedSpectrumDataset, consisting of + for each spectrum (i) a tensor with the m/z and intensity peak + values, (ii), the precursor m/z, (iii) the precursor charge, + (iv) the spectrum identifier. Returns ------- spectra : torch.Tensor of shape (batch_size, n_peaks, 2) - The padded mass spectra tensor with the m/z and intensity peak values - for each spectrum. 
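As a stand-alone illustration of the collation step documented here (toy tensors, not the module's prepare_batch): variable-length spectra are padded into a single tensor, and the precursor neutral mass is derived from the observed m/z and charge using the 1.007276 Da proton mass, giving one (neutral mass, charge, m/z) row per spectrum, i.e. a tensor of shape (batch_size, 3).

import torch

# Two toy spectra with different numbers of peaks, each of shape (n_peaks, 2).
spectra = [torch.rand(5, 2), torch.rand(8, 2)]
precursor_mzs = torch.tensor([500.30, 650.70])
precursor_charges = torch.tensor([2.0, 3.0])

# Pad to a common length so the batch fits in a single tensor.
padded = torch.nn.utils.rnn.pad_sequence(spectra, batch_first=True)
# Neutral mass from m/z and charge, using the proton mass constant.
masses = (precursor_mzs - 1.007276) * precursor_charges
# One row per spectrum: (neutral mass, charge, m/z), shape (batch_size, 3).
precursors = torch.stack([masses, precursor_charges, precursor_mzs], dim=1).float()

print(padded.shape)      # torch.Size([2, 8, 2])
print(precursors.shape)  # torch.Size([2, 3])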
+ The padded mass spectra tensor with the m/z and intensity peak + values for each spectrum. precursors : torch.Tensor of shape (batch_size, 3) A tensor with the precursor neutral mass, precursor charge, and precursor m/z. @@ -229,80 +228,75 @@ def prepare_batch( precursor_mzs = torch.tensor(precursor_mzs) precursor_charges = torch.tensor(precursor_charges) precursor_masses = (precursor_mzs - 1.007276) * precursor_charges - precursors = torch.vstack( + precursors = torch.hstack( [precursor_masses, precursor_charges, precursor_mzs] - ).T.float() + ).float() return spectra, precursors, np.asarray(spectrum_ids) def prepare_psm_batch( batch: List[Tuple[torch.Tensor, float, int, str]], protein_database: db_utils.ProteinDatabase, -) -> Tuple[torch.Tensor, torch.Tensor, np.ndarray, List[str], List[str]]: +) -> Tuple[torch.Tensor, torch.Tensor, np.ndarray, np.ndarray]: """ Collate MS/MS spectra into a batch for DB search. - The MS/MS spectra will be padded so that they fit nicely as a tensor. - However, the padded elements are ignored during the subsequent steps. + The MS/MS spectra will be padded so that they fit nicely as a + tensor. However, the padded elements are ignored during the + subsequent steps. Parameters ---------- batch : List[Tuple[torch.Tensor, float, int, str]] - A batch of data from an AnnotatedSpectrumDataset, consisting of for each - spectrum (i) a tensor with the m/z and intensity peak values, (ii), the - precursor m/z, (iii) the precursor charge, (iv) the spectrum identifier. + A batch of data from an AnnotatedSpectrumDataset, consisting of + for each spectrum (i) a tensor with the m/z and intensity peak + values, (ii), the precursor m/z, (iii) the precursor charge, + (iv) the spectrum identifier. protein_database : db_utils.ProteinDatabase The protein database to use for candidate peptide retrieval. Returns ------- - all_spectra : torch.Tensor of shape (batch_size, n_peaks, 2) - The padded mass spectra tensor with the m/z and intensity peak values - for each spectrum. - all_precursors : torch.Tensor of shape (batch_size, 3) + batch_spectra : torch.Tensor of shape (batch_size, n_peaks, 2) + The padded mass spectra tensor with the m/z and intensity peak + values for each spectrum. + batch_precursors : torch.Tensor of shape (batch_size, 3) A tensor with the precursor neutral mass, precursor charge, and precursor m/z. - all_spectrum_ids : np.ndarray + batch_spectrum_ids : np.ndarray The spectrum identifiers. - all_peptides : List[str] + batch_peptides : np.ndarray The candidate peptides for each spectrum. """ - spectra, precursor_mzs, precursor_charges, spectrum_ids = list(zip(*batch)) - spectra = torch.nn.utils.rnn.pad_sequence(spectra, batch_first=True) - - precursor_mzs_t = torch.tensor(precursor_mzs) - precursor_charges_t = torch.tensor(precursor_charges) - precursor_masses_t = (precursor_mzs_t - 1.007276) * precursor_charges_t - precursors = torch.vstack( - [precursor_masses_t, precursor_charges_t, precursor_mzs_t] - ).T.float() - - all_spectra = [] - all_precursors = [] - all_spectrum_ids = [] - all_peptides = [] - for idx in range(len(batch)): - spec_peptides = protein_database.get_candidates( - precursor_mzs[idx], - precursor_charges[idx], + spectra, precursors, spectrum_ids = prepare_batch(batch) + + batch_spectra = [] + batch_precursors = [] + batch_spectrum_ids = [] + batch_peptides = [] + # FIXME: This can be optmized by using a sliding window instead of + # retrieving candidates for each spectrum indendently. 
+ for i in range(len(batch)): + candidate_pep = protein_database.get_candidates( + precursors[i][2], precursors[i][1] ) - try: - all_spectra.append( - spectra[idx].unsqueeze(0).repeat(len(spec_peptides), 1, 1) + if len(candidate_pep) == 0: + logger.info( + "No candidate peptides found for spectrum %s", spectrum_ids[i] ) - all_precursors.append( - precursors[idx].unsqueeze(0).repeat(len(spec_peptides), 1) + else: + batch_spectra.append( + spectra[i].unsqueeze(0).repeat(len(candidate_pep), 1, 1) ) - all_spectrum_ids.extend([spectrum_ids[idx]] * len(spec_peptides)) - all_peptides.extend(spec_peptides) - except ValueError: - logger.warning( - "No candidates found for spectrum %s", spectrum_ids[idx] + batch_precursors.append( + precursors[i].unsqueeze(0).repeat(len(candidate_pep), 1) ) + batch_spectrum_ids.extend([spectrum_ids[i]] * len(candidate_pep)) + batch_peptides.extend(candidate_pep) return ( - torch.cat(all_spectra, dim=0), - torch.cat(all_precursors, dim=0), - all_spectrum_ids, - all_peptides, + torch.cat(batch_spectra, dim=0), + torch.cat(batch_precursors, dim=0), + np.asarray(batch_spectrum_ids), + np.asarray(batch_peptides), ) From ee784421b7981717817168614c569f2818d2f432 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Sun, 10 Nov 2024 15:46:37 +0100 Subject: [PATCH 66/84] More minor refactoring --- casanovo/config.py | 8 +- casanovo/config.yaml | 2 +- casanovo/data/psm.py | 19 +- casanovo/denovo/model.py | 477 ++++++++++++++++---------------- casanovo/denovo/model_runner.py | 90 +++--- 5 files changed, 298 insertions(+), 298 deletions(-) diff --git a/casanovo/config.py b/casanovo/config.py index ea25428c..e276e12d 100644 --- a/casanovo/config.py +++ b/casanovo/config.py @@ -13,8 +13,8 @@ logger = logging.getLogger("casanovo") -# FIXME: This contains deprecated config options to be removed in the next major -# version update. +# FIXME: This contains deprecated config options to be removed in the next +# major version update. _config_deprecated = dict( every_n_train_steps="val_check_interval", max_iters="cosine_schedule_period_iters", @@ -27,8 +27,8 @@ class Config: """The Casanovo configuration options. - If a parameter is missing from a user's configuration file, the default - value is assumed. + If a parameter is missing from a user's configuration file, the + default value is assumed. Parameters ---------- diff --git a/casanovo/config.yaml b/casanovo/config.yaml index 014f02ee..b7179347 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -33,7 +33,7 @@ devices: ### -# The following parameters are unique to Casanovo's inference/finetuning mode. +# The following parameters are unique to Casanovo's de novo sequencing mode. ### # Number of beams used in beam search. diff --git a/casanovo/data/psm.py b/casanovo/data/psm.py index e4ef3af7..eece07a4 100644 --- a/casanovo/data/psm.py +++ b/casanovo/data/psm.py @@ -1,4 +1,4 @@ -"""Peptide spectrum match dataclass""" +"""Peptide spectrum match dataclass.""" import dataclasses from typing import Tuple, Iterable @@ -15,23 +15,24 @@ class PepSpecMatch: The amino acid sequence of the peptide. spectrum_id : Tuple[str, str] A tuple containing the spectrum identifier in the form - (spectrum file name, spectrum file idx) + (spectrum file name, spectrum file idx). peptide_score : float Score of the match between the full peptide sequence and the spectrum. charge : int - The precursor charge state of the peptide ion observed in the spectrum. + The precursor charge state of the peptide ion observed in the + spectrum. 
calc_mz : float - The calculated mass-to-charge ratio (m/z) of the peptide based on its - sequence and charge state. + The calculated mass-to-charge ratio (m/z) of the peptide based + on its sequence and charge state. exp_mz : float - The observed (experimental) precursor mass-to-charge ratio (m/z) of the - peptide as detected in the spectrum. + The observed (experimental) precursor mass-to-charge ratio (m/z) + of the peptide as detected in the spectrum. aa_scores : Iterable[float] A list of scores for individual amino acids in the peptide - sequence, where len(aa_scores) == len(sequence) + sequence, where len(aa_scores) == len(sequence). protein : str - Protein associated with the peptide sequence (for db mode) + Protein associated with the peptide sequence (for db mode). """ sequence: str diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 5e807153..d309d11c 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -4,6 +4,7 @@ import heapq import logging import warnings +from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import depthcharge.masses @@ -32,37 +33,39 @@ class Spec2Pep(pl.LightningModule, ModelMixin): dim_model : int The latent dimensionality used by the transformer model. n_head : int - The number of attention heads in each layer. ``dim_model`` must be - divisible by ``n_head``. + The number of attention heads in each layer. ``dim_model`` must + be divisible by ``n_head``. dim_feedforward : int - The dimensionality of the fully connected layers in the transformer - model. + The dimensionality of the fully connected layers in the + transformer model. n_layers : int The number of transformer layers. dropout : float The dropout probability for all layers. dim_intensity : Optional[int] - The number of features to use for encoding peak intensity. The remaining - (``dim_model - dim_intensity``) are reserved for encoding the m/z value. - If ``None``, the intensity will be projected up to ``dim_model`` using a - linear layer, then summed with the m/z encoding for each peak. + The number of features to use for encoding peak intensity. The + remaining (``dim_model - dim_intensity``) are reserved for + encoding the m/z value. If ``None``, the intensity will be + projected up to ``dim_model`` using a linear layer, then summed + with the m/z encoding for each peak. max_peptide_len : int The maximum peptide length to decode. residues : Union[Dict[str, float], str] - The amino acid dictionary and their masses. By default ("canonical) this - is only the 20 canonical amino acids, with cysteine carbamidomethylated. - If "massivekb", this dictionary will include the modifications found in - MassIVE-KB. Additionally, a dictionary can be used to specify a custom + The amino acid dictionary and their masses. By default + ("canonical) this is only the 20 canonical amino acids, with + cysteine carbamidomethylated. If "massivekb", this dictionary + will include the modifications found in MassIVE-KB. + Additionally, a dictionary can be used to specify a custom collection of amino acids and masses. max_charge : int The maximum precursor charge to consider. precursor_mass_tol : float, optional - The maximum allowable precursor mass tolerance (in ppm) for correct - predictions. + The maximum allowable precursor mass tolerance (in ppm) for + correct predictions. 
isotope_error_range : Tuple[int, int] - Take into account the error introduced by choosing a non-monoisotopic - peak for fragmentation by not penalizing predicted precursor m/z's that - fit the specified isotope error: + Take into account the error introduced by choosing a + non-monoisotopic peak for fragmentation by not penalizing + predicted precursor m/z's that fit the specified isotope error: `abs(calc_mz - (precursor_mz - isotope * 1.00335 / precursor_charge)) < precursor_mass_tol` min_peptide_len : int @@ -73,16 +76,18 @@ class Spec2Pep(pl.LightningModule, ModelMixin): Number of PSMs to return for each spectrum. n_log : int The number of epochs to wait between logging messages. - tb_summarywriter : Optional[str] - Folder path to record performance metrics during training. If ``None``, - don't use a ``SummaryWriter``. + tb_summarywriter : Optional[Path] + Folder path to record performance metrics during training. If + ``None``, don't use a ``SummaryWriter``. train_label_smoothing : float Smoothing factor when calculating the training loss. warmup_iters : int - The number of iterations for the linear warm-up of the learning rate. + The number of iterations for the linear warm-up of the learning + rate. cosine_schedule_period_iters : int - The number of iterations for the cosine half period of the learning rate. - out_writer : Optional[str] + The number of iterations for the cosine half period of the + learning rate. + out_writer : Optional[ms_io.MztabWriter] The output writer for the prediction results. calculate_precision : bool Calculate the validation set precision during training. @@ -108,9 +113,7 @@ def __init__( n_beams: int = 1, top_match: int = 1, n_log: int = 10, - tb_summarywriter: Optional[ - torch.utils.tensorboard.SummaryWriter - ] = None, + tb_summarywriter: Optional[Path] = None, train_label_smoothing: float = 0.01, warmup_iters: int = 100_000, cosine_schedule_period_iters: int = 600_000, @@ -147,8 +150,9 @@ def __init__( # Optimizer settings. self.warmup_iters = warmup_iters self.cosine_schedule_period_iters = cosine_schedule_period_iters - # `kwargs` will contain additional arguments as well as unrecognized - # arguments, including deprecated ones. Remove the deprecated ones. + # `kwargs` will contain additional arguments as well as + # unrecognized arguments, including deprecated ones. Remove the + # deprecated ones. for k in config._config_deprecated: kwargs.pop(k, None) warnings.warn( @@ -175,12 +179,12 @@ def __init__( self.n_log = n_log self._history = [] if tb_summarywriter is not None: - self.tb_summarywriter = SummaryWriter(tb_summarywriter) + self.tb_summarywriter = SummaryWriter(str(tb_summarywriter)) else: - self.tb_summarywriter = tb_summarywriter + self.tb_summarywriter = None # Output writer during predicting. - self.out_writer = out_writer + self.out_writer: ms_io.MztabWriter = out_writer def forward( self, spectra: torch.Tensor, precursors: torch.Tensor @@ -192,20 +196,22 @@ def forward( ---------- spectra : torch.Tensor of shape (n_spectra, n_peaks, 2) The spectra for which to predict peptide sequences. - Axis 0 represents an MS/MS spectrum, axis 1 contains the peaks in - the MS/MS spectrum, and axis 2 is essentially a 2-tuple specifying - the m/z-intensity pair for each peak. These should be zero-padded, - such that all the spectra in the batch are the same length. + Axis 0 represents an MS/MS spectrum, axis 1 contains the + peaks in the MS/MS spectrum, and axis 2 is essentially a + 2-tuple specifying the m/z-intensity pair for each peak. 
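The isotope-aware precursor check described in the docstring above can be sketched as a small stand-alone function (not the model's implementation): a peptide is accepted if, for some isotope offset in isotope_error_range, the error between its calculated m/z and the isotope-corrected observed m/z falls within precursor_mass_tol in ppm. The sketch assumes calc_mz = neutral_mass / charge + 1.007276 and the 1.00335 Da isotope spacing from the formula above.

def fits_precursor_mz(calc_mass, obs_mz, charge, tol_ppm, isotope_error_range):
    calc_mz = calc_mass / charge + 1.007276
    for isotope in range(isotope_error_range[0], isotope_error_range[1] + 1):
        # Correct the observed m/z for a possible non-monoisotopic peak selection.
        corrected_mz = obs_mz - isotope * 1.00335 / charge
        delta_ppm = (calc_mz - corrected_mz) / corrected_mz * 1e6
        if abs(delta_ppm) < tol_ppm:
            return True
    return False

calc_mass, charge = 829.46, 2
calc_mz = calc_mass / charge + 1.007276
# Simulate picking the M+1 isotope peak instead of the monoisotopic peak:
obs_mz = calc_mz + 1.00335 / charge
print(fits_precursor_mz(calc_mass, obs_mz, charge, 50.0, (0, 1)))  # True, matched at isotope = 1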
+ These should be zero-padded, such that all the spectra in + the batch are the same length. precursors : torch.Tensor of size (n_spectra, 3) - The measured precursor mass (axis 0), precursor charge (axis 1), and - precursor m/z (axis 2) of each MS/MS spectrum. + The measured precursor mass (axis 0), precursor charge + (axis 1), and precursor m/z (axis 2) of each MS/MS spectrum. Returns ------- pred_peptides : List[List[Tuple[float, np.ndarray, str]]] - For each spectrum, a list with the top peptide predictions. A - peptide predictions consists of a tuple with the peptide score, - the amino acid scores, and the predicted peptide sequence. + For each spectrum, a list with the top peptide predictions. + A peptide predictions consists of a tuple with the peptide + score, the amino acid scores, and the predicted peptide + sequence. """ return self.beam_search_decode( spectra.to(self.encoder.device), @@ -222,20 +228,22 @@ def beam_search_decode( ---------- spectra : torch.Tensor of shape (n_spectra, n_peaks, 2) The spectra for which to predict peptide sequences. - Axis 0 represents an MS/MS spectrum, axis 1 contains the peaks in - the MS/MS spectrum, and axis 2 is essentially a 2-tuple specifying - the m/z-intensity pair for each peak. These should be zero-padded, - such that all the spectra in the batch are the same length. + Axis 0 represents an MS/MS spectrum, axis 1 contains the + peaks in the MS/MS spectrum, and axis 2 is essentially a + 2-tuple specifying the m/z-intensity pair for each peak. + These should be zero-padded, such that all the spectra in + the batch are the same length. precursors : torch.Tensor of size (n_spectra, 3) - The measured precursor mass (axis 0), precursor charge (axis 1), and - precursor m/z (axis 2) of each MS/MS spectrum. + The measured precursor mass (axis 0), precursor charge + (axis 1), and precursor m/z (axis 2) of each MS/MS spectrum. Returns ------- pred_peptides : List[List[Tuple[float, np.ndarray, str]]] - For each spectrum, a list with the top peptide prediction(s). A - peptide predictions consists of a tuple with the peptide score, - the amino acid scores, and the predicted peptide sequence. + For each spectrum, a list with the top peptide + prediction(s). A peptide predictions consists of a tuple + with the peptide score, the amino acid scores, and the + predicted peptide sequence. """ memories, mem_masks = self.encoder(spectra) @@ -270,15 +278,16 @@ def beam_search_decode( # The main decoding loop. for step in range(0, self.max_peptide_len): - # Terminate beams exceeding the precursor m/z tolerance and track - # all finished beams (either terminated or stop token predicted). + # Terminate beams exceeding the precursor m/z tolerance and + # track all finished beams (either terminated or stop token + # predicted). ( finished_beams, beam_fits_precursor, discarded_beams, ) = self._finish_beams(tokens, precursors, step) - # Cache peptide predictions from the finished beams (but not the - # discarded beams). + # Cache peptide predictions from the finished beams (but not + # the discarded beams). self._cache_finished_beams( tokens, scores, @@ -289,7 +298,8 @@ def beam_search_decode( ) # Stop decoding when all current beams have been finished. - # Continue with beams that have not been finished and not discarded. + # Continue with beams that have not been finished and not + # discarded. 
finished_beams |= discarded_beams if finished_beams.all(): break @@ -300,14 +310,14 @@ def beam_search_decode( memories[~finished_beams, :, :], mem_masks[~finished_beams, :], ) - # Find the top-k beams with the highest scores and continue decoding - # those. + # Find the top-k beams with the highest scores and continue + # decoding those. tokens, scores = self._get_topk_beams( tokens, scores, finished_beams, batch, step + 1 ) - # Return the peptide with the highest confidence score, within the - # precursor m/z tolerance if possible. + # Return the peptide with the highest confidence score, within + # the precursor m/z tolerance if possible. return list(self._get_top_peptide(pred_cache)) def _finish_beams( @@ -317,9 +327,9 @@ def _finish_beams( step: int, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ - Track all beams that have been finished, either by predicting the stop - token or because they were terminated due to exceeding the precursor - m/z tolerance. + Track all beams that have been finished, either by predicting + the stop token or because they were terminated due to exceeding + the precursor m/z tolerance. Parameters ---------- @@ -327,23 +337,23 @@ def _finish_beams( Predicted amino acid tokens for all beams and all spectra. scores : torch.Tensor of shape (n_spectra * n_beams, max_peptide_len, n_amino_acids) - Scores for the predicted amino acid tokens for all beams and all - spectra. + Scores for the predicted amino acid tokens for all beams and + all spectra. step : int Index of the current decoding step. Returns ------- finished_beams : torch.Tensor of shape (n_spectra * n_beams) - Boolean tensor indicating whether the current beams have been - finished. + Boolean tensor indicating whether the current beams have + been finished. beam_fits_precursor: torch.Tensor of shape (n_spectra * n_beams) - Boolean tensor indicating if current beams are within precursor m/z - tolerance. + Boolean tensor indicating if current beams are within + precursor m/z tolerance. discarded_beams : torch.Tensor of shape (n_spectra * n_beams) - Boolean tensor indicating whether the current beams should be - discarded (e.g. because they were predicted to end but violate the - minimum peptide length). + Boolean tensor indicating whether the current beams should + be discarded (e.g. because they were predicted to end but + violate the minimum peptide length). """ # Check for tokens with a negative mass (i.e. neutral loss). aa_neg_mass = [None] @@ -362,7 +372,8 @@ def _finish_beams( beam_fits_precursor = torch.zeros( tokens.shape[0], dtype=torch.bool ).to(self.encoder.device) - # Beams with a stop token predicted in the current step can be finished. + # Beams with a stop token predicted in the current step can be + # finished. finished_beams = torch.zeros(tokens.shape[0], dtype=torch.bool).to( self.encoder.device ) @@ -374,8 +385,9 @@ def _finish_beams( self.encoder.device ) discarded_beams[tokens[:, step] == 0] = True - # Discard beams with invalid modification combinations (i.e. N-terminal - # modifications occur multiple times or in internal positions). + # Discard beams with invalid modification combinations (i.e. + # N-terminal modifications occur multiple times or in internal + # positions). if step > 1: # Only relevant for longer predictions. 
dim0 = torch.arange(tokens.shape[0]) final_pos = torch.full((ends_stop_token.shape[0],), step) @@ -392,8 +404,8 @@ def _finish_beams( ).any(dim=1) discarded_beams[multiple_mods | internal_mods] = True - # Check which beams should be terminated or discarded based on the - # predicted peptide. + # Check which beams should be terminated or discarded based on + # the predicted peptide. for i in range(len(finished_beams)): # Skip already discarded beams. if discarded_beams[i]: @@ -408,15 +420,15 @@ def _finish_beams( elif not self.decoder.reverse and peptide[-1] == "$": peptide = peptide[:-1] peptide_len -= 1 - # Discard beams that were predicted to end but don't fit the minimum - # peptide length. + # Discard beams that were predicted to end but don't fit the + # minimum peptide length. if finished_beams[i] and peptide_len < self.min_peptide_len: discarded_beams[i] = True continue - # Terminate the beam if it has not been finished by the model but - # the peptide mass exceeds the precursor m/z to an extent that it - # cannot be corrected anymore by a subsequently predicted AA with - # negative mass. + # Terminate the beam if it has not been finished by the + # model but the peptide mass exceeds the precursor m/z to an + # extent that it cannot be corrected anymore by a + # subsequently predicted AA with negative mass. precursor_charge = precursors[i, 1] precursor_mz = precursors[i, 2] matches_precursor_mz = exceeds_precursor_mz = False @@ -442,16 +454,18 @@ def _finish_beams( self.isotope_error_range[1] + 1, ) ] - # Terminate the beam if the calculated m/z for the predicted - # peptide (without potential additional AAs with negative - # mass) is within the precursor m/z tolerance. + # Terminate the beam if the calculated m/z for the + # predicted peptide (without potential additional + # AAs with negative mass) is within the precursor + # m/z tolerance. matches_precursor_mz = aa is None and any( abs(d) < self.precursor_mass_tol for d in delta_mass_ppm ) - # Terminate the beam if the calculated m/z exceeds the - # precursor m/z + tolerance and hasn't been corrected by a - # subsequently predicted AA with negative mass. + # Terminate the beam if the calculated m/z exceeds + # the precursor m/z + tolerance and hasn't been + # corrected by a subsequently predicted AA with + # negative mass. if matches_precursor_mz: exceeds_precursor_mz = False else: @@ -466,8 +480,8 @@ def _finish_beams( except KeyError: matches_precursor_mz = exceeds_precursor_mz = False # Finish beams that fit or exceed the precursor m/z. - # Don't finish beams that don't include a stop token if they don't - # exceed the precursor m/z tolerance yet. + # Don't finish beams that don't include a stop token if they + # don't exceed the precursor m/z tolerance yet. if finished_beams[i]: beam_fits_precursor[i] = matches_precursor_mz elif exceeds_precursor_mz: @@ -495,13 +509,13 @@ def _cache_finished_beams( Predicted amino acid tokens for all beams and all spectra. scores : torch.Tensor of shape (n_spectra * n_beams, max_peptide_len, n_amino_acids) - Scores for the predicted amino acid tokens for all beams and all - spectra. + Scores for the predicted amino acid tokens for all beams and + all spectra. step : int Index of the current decoding step. beams_to_cache : torch.Tensor of shape (n_spectra * n_beams) - Boolean tensor indicating whether the current beams are ready for - caching. + Boolean tensor indicating whether the current beams are + ready for caching. 
beam_fits_precursor: torch.Tensor of shape (n_spectra * n_beams) Boolean tensor indicating whether the beams are within the precursor m/z tolerance. @@ -509,9 +523,9 @@ def _cache_finished_beams( int, List[Tuple[float, float, np.ndarray, torch.Tensor]] ] Priority queue with finished beams for each spectrum, ordered by - peptide score. For each finished beam, a tuple with the (negated) - peptide score, a random tie-breaking float, the amino acid-level - scores, and the predicted tokens is stored. + peptide score. For each finished beam, a tuple with the + (negated) peptide score, a random tie-breaking float, the + amino acid-level scores, and the predicted tokens is stored. """ for i in range(len(beams_to_cache)): if not beams_to_cache[i]: @@ -533,8 +547,8 @@ def _cache_finished_beams( continue smx = self.softmax(scores[i : i + 1, : step + 1, :]) aa_scores = smx[0, range(len(pred_tokens)), pred_tokens].tolist() - # Add an explicit score 0 for the missing stop token in case this - # was not predicted (i.e. early stopping). + # Add an explicit score 0 for the missing stop token in case + # this was not predicted (i.e. early stopping). if not has_stop_token: aa_scores.append(0) aa_scores = np.asarray(aa_scores) @@ -544,8 +558,8 @@ def _cache_finished_beams( ) # Omit the stop token from the amino acid-level scores. aa_scores = aa_scores[:-1] - # Add the prediction to the cache (minimum priority queue, maximum - # the number of beams elements). + # Add the prediction to the cache (minimum priority queue, + # maximum the number of beams elements). if len(pred_cache[spec_idx]) < self.n_beams: heapadd = heapq.heappush else: @@ -569,8 +583,8 @@ def _get_topk_beams( step: int, ) -> Tuple[torch.tensor, torch.tensor]: """ - Find the top-k beams with the highest scores and continue decoding - those. + Find the top-k beams with the highest scores and continue + decoding those. Stop decoding for beams that have been finished. @@ -580,11 +594,11 @@ def _get_topk_beams( Predicted amino acid tokens for all beams and all spectra. scores : torch.Tensor of shape (n_spectra * n_beams, max_peptide_len, n_amino_acids) - Scores for the predicted amino acid tokens for all beams and all - spectra. + Scores for the predicted amino acid tokens for all beams and + all spectra. finished_beams : torch.Tensor of shape (n_spectra * n_beams) - Boolean tensor indicating whether the current beams are ready for - caching. + Boolean tensor indicating whether the current beams are + ready for caching. batch: int Number of spectra in the batch. step : int @@ -596,8 +610,8 @@ def _get_topk_beams( Predicted amino acid tokens for all beams and all spectra. scores : torch.Tensor of shape (n_spectra * n_beams, max_peptide_len, n_amino_acids) - Scores for the predicted amino acid tokens for all beams and all - spectra. + Scores for the predicted amino acid tokens for all beams and + all spectra. """ beam = self.n_beams # S vocab = self.decoder.vocab_size + 1 # V @@ -632,7 +646,7 @@ def _get_topk_beams( ).float() # Mask out the index '0', i.e. padding token, by default. # FIXME: Set this to a very small, yet non-zero value, to only - # get padding after stop token. + # get padding after stop token. active_mask[:, :beam] = 1e-8 # Figure out the top K decodings. @@ -660,24 +674,26 @@ def _get_top_peptide( ], ) -> Iterable[List[Tuple[float, np.ndarray, str]]]: """ - Return the peptide with the highest confidence score for each spectrum. + Return the peptide with the highest confidence score for each + spectrum. 
Parameters ---------- pred_cache : Dict[ int, List[Tuple[float, float, np.ndarray, torch.Tensor]] ] - Priority queue with finished beams for each spectrum, ordered by - peptide score. For each finished beam, a tuple with the peptide - score, a random tie-breaking float, the amino acid-level scores, - and the predicted tokens is stored. + Priority queue with finished beams for each spectrum, + ordered by peptide score. For each finished beam, a tuple + with the peptide score, a random tie-breaking float, the + amino acid-level scores, and the predicted tokens is stored. Returns ------- pred_peptides : Iterable[List[Tuple[float, np.ndarray, str]]] - For each spectrum, a list with the top peptide prediction(s). A - peptide predictions consists of a tuple with the peptide score, - the amino acid scores, and the predicted peptide sequence. + For each spectrum, a list with the top peptide + prediction(s). A peptide predictions consists of a tuple + with the peptide score, the amino acid scores, and the + predicted peptide sequence. """ for peptides in pred_cache.values(): if len(peptides) > 0: @@ -707,13 +723,14 @@ def _forward_step( ---------- spectra : torch.Tensor of shape (n_spectra, n_peaks, 2) The spectra for which to predict peptide sequences. - Axis 0 represents an MS/MS spectrum, axis 1 contains the peaks in - the MS/MS spectrum, and axis 2 is essentially a 2-tuple specifying - the m/z-intensity pair for each peak. These should be zero-padded, - such that all the spectra in the batch are the same length. + Axis 0 represents an MS/MS spectrum, axis 1 contains the + peaks in the MS/MS spectrum, and axis 2 is essentially a + 2-tuple specifying the m/z-intensity pair for each peak. + These should be zero-padded, such that all the spectra in + the batch are the same length. precursors : torch.Tensor of size (n_spectra, 3) - The measured precursor mass (axis 0), precursor charge (axis 1), and - precursor m/z (axis 2) of each MS/MS spectrum. + The measured precursor mass (axis 0), precursor charge + (axis 1), and precursor m/z (axis 2) of each MS/MS spectrum. sequences : List[str] of length n_spectra The partial peptide sequences to predict. @@ -738,8 +755,8 @@ def training_step( Parameters ---------- batch : Tuple[torch.Tensor, torch.Tensor, List[str]] - A batch of (i) MS/MS spectra, (ii) precursor information, (iii) - peptide sequences as torch Tensors. + A batch of (i) MS/MS spectra, (ii) precursor information, + (iii) peptide sequences as torch Tensors. mode : str Logging key to describe the current stage. @@ -772,8 +789,8 @@ def validation_step( Parameters ---------- batch : Tuple[torch.Tensor, torch.Tensor, List[str]] - A batch of (i) MS/MS spectra, (ii) precursor information, (iii) - peptide sequences. + A batch of (i) MS/MS spectra, (ii) precursor information, + (iii) peptide sequences. Returns ------- @@ -785,8 +802,8 @@ def validation_step( if not self.calculate_precision: return loss - # Calculate and log amino acid and peptide match evaluation metrics from - # the predicted peptides. + # Calculate and log amino acid and peptide match evaluation + # metrics from the predicted peptides. 
peptides_pred, peptides_true = [], batch[2] for spectrum_preds in self.forward(batch[0], batch[1]): for _, _, pred in spectrum_preds: @@ -794,42 +811,30 @@ def validation_step( aa_precision, _, pep_precision = evaluate.aa_match_metrics( *evaluate.aa_match_batch( - peptides_true, - peptides_pred, - self.decoder._peptide_mass.masses, + peptides_true, peptides_pred, self.decoder._peptide_mass.masses ) ) log_args = dict(on_step=False, on_epoch=True, sync_dist=True) - self.log( - "Peptide precision at coverage=1", - pep_precision, - **log_args, - ) - self.log( - "AA precision at coverage=1", - aa_precision, - **log_args, - ) + self.log("Peptide precision at coverage=1", pep_precision, **log_args) + self.log("AA precision at coverage=1", aa_precision, **log_args) return loss def predict_step( self, batch: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], *args - ) -> List[Tuple[np.ndarray, float, float, str, float, np.ndarray]]: + ) -> List[ms_io.PepSpecMatch]: """ A single prediction step. Parameters ---------- batch : Tuple[torch.Tensor, torch.Tensor, torch.Tensor] - A batch of (i) MS/MS spectra, (ii) precursor information, (iii) - spectrum identifiers as torch Tensors. + A batch of (i) MS/MS spectra, (ii) precursor information, + (iii) spectrum identifiers as torch Tensors. Returns ------- - predictions: List[Tuple[np.ndarray, float, float, str, float, np.ndarray]] - Model predictions for the given batch of spectra containing spectrum - ids, precursor information, peptide sequences as well as peptide - and amino acid-level confidence scores. + predictions: List[ms_io.PepSpecMatch] + Predicted PSMs for the given batch of spectra. """ predictions = [] for ( @@ -845,13 +850,16 @@ def predict_step( ): for peptide_score, aa_scores, peptide in spectrum_preds: predictions.append( - ( - spectrum_i, - precursor_charge, - precursor_mz, - peptide, - peptide_score, - aa_scores, + ms_io.PepSpecMatch( + sequence=peptide, + spectrum_id=tuple(spectrum_i), + peptide_score=peptide_score, + charge=int(precursor_charge), + calc_mz=self.peptide_mass_calculator.mass( + peptide, precursor_charge + ), + exp_mz=precursor_mz, + aa_scores=aa_scores, ) ) @@ -897,8 +905,8 @@ def on_predict_batch_end( *args, ) -> None: """ - Write the predicted peptide sequences and amino acid scores to the - output file. + Write the predicted peptide sequences and amino acid scores to + the output file. """ if self.out_writer is None: return @@ -970,16 +978,18 @@ def _log_history(self) -> None: def configure_optimizers( self, - ) -> Tuple[torch.optim.Optimizer, Dict[str, Any]]: + ) -> Tuple[List[torch.optim.Optimizer], Dict[str, Any]]: """ Initialize the optimizer. - This is used by pytorch-lightning when preparing the model for training. + This is used by pytorch-lightning when preparing the model for + training. Returns ------- - Tuple[torch.optim.Optimizer, Dict[str, Any]] - The initialized Adam optimizer and its learning rate scheduler. + Tuple[List[torch.optim.Optimizer], Dict[str, Any]] + The initialized Adam optimizer and its learning rate + scheduler. """ optimizer = torch.optim.Adam(self.parameters(), **self.opt_kwargs) # Apply learning rate scheduler per step. @@ -991,8 +1001,8 @@ def configure_optimizers( class DbSpec2Pep(Spec2Pep): """ - Subclass of Spec2Pep for the use of Casanovo as an - MS/MS database search score function. + Subclass of Spec2Pep for the use of Casanovo as an MS/MS database + search score function. Uses teacher forcing to 'query' Casanovo to score a peptide-spectrum pair. 
Higher scores indicate a better match between the peptide and @@ -1006,34 +1016,35 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.psm_batch_size = None - def predict_step(self, batch, *args): + def predict_step( + self, + batch: Tuple[torch.Tensor, torch.Tensor, np.ndarray, np.ndarray], + *args + ) -> List[ms_io.PepSpecMatch]: """ - A single prediction step for Casanovo-DB + A single prediction step. Parameters ---------- - batch : Tuple[torch.Tensor, torch.Tensor, np.array, List[str]] - A batch of (i) MS/MS spectra, (ii) precursor information, (iii) - spectrum identifiers, (iv) candidate peptides + batch : Tuple[torch.Tensor, torch.Tensor, np.ndarray, np.ndarray] + A batch of (i) MS/MS spectra, (ii) precursor information, + (iii) spectrum identifiers, (iv) candidate peptides. Returns ------- - predictions: List[Tuple[List[str], int, float, str, np.ndarray, np.ndarray, str]] - Model predictions for the given batch of spectra containing spectrum - ids, precursor charge and m/z, candidate peptide sequences, peptide - scores, amino acid-level scores, and associated proteins. - Stored separately by spectrum id. + predictions: List[ms_io.PepSpecMatch] + Predicted PSMs for the given batch of spectra. """ - store_dict = collections.defaultdict(list) - for start_idx in range(0, len(batch[0]), self.psm_batch_size): - current_batch = [ - b[start_idx : start_idx + self.psm_batch_size] for b in batch + predictions_all = collections.defaultdict(list) + for start_i in range(0, len(batch[0]), self.psm_batch_size): + psm_batch = [ + b[start_i : start_i + self.psm_batch_size] for b in batch ] pred, truth = self._forward_step( - current_batch[0], current_batch[1], current_batch[3] + psm_batch[0], psm_batch[1], psm_batch[3] ) pred = self.softmax(pred) - all_peptide_scores, all_aa_scores = _calc_match_score( + batch_peptide_scores, batch_aa_scores = _calc_match_score( pred, truth, self.decoder.reverse ) for ( @@ -1044,46 +1055,44 @@ def predict_step(self, batch, *args): aa_scores, peptide, ) in zip( - current_batch[1][:, 1].cpu().detach().numpy(), - current_batch[1][:, 2].cpu().detach().numpy(), - current_batch[2], - all_peptide_scores, - all_aa_scores, - current_batch[3], + psm_batch[1][:, 1].cpu().detach().numpy(), + psm_batch[1][:, 2].cpu().detach().numpy(), + psm_batch[2], + batch_peptide_scores, + batch_aa_scores, + psm_batch[3], ): - store_dict[spectrum_i].append( + predictions_all[spectrum_i].append( ms_io.PepSpecMatch( sequence=peptide, spectrum_id=tuple(spectrum_i), peptide_score=peptide_score, charge=int(charge), - calc_mz=precursor_mz, - exp_mz=self.peptide_mass_calculator.mass( + calc_mz=self.peptide_mass_calculator.mass( peptide, charge ), + exp_mz=precursor_mz, aa_scores=aa_scores, protein=self.protein_database.get_associated_protein( peptide ), ) ) - predictions = [] - for spectrum_i in store_dict: - predictions.extend( + # Filter the top-scoring prediction(s) for each spectrum. + predictions = [ + *( sorted( - store_dict[spectrum_i], - key=lambda x: x.peptide_score, + spectrum_predictions, + key=lambda p: p.peptide_score, reverse=True, )[: self.top_match] + for spectrum_predictions in predictions_all.values() ) + ] return predictions def on_predict_batch_end( - self, - outputs: List[ - Tuple[List[str], int, float, str, np.ndarray, np.ndarray, str] - ], - *args, + self, outputs: List[ms_io.PepSpecMatch], *args ) -> None: """ Write the database search results to the output file. 
@@ -1095,37 +1104,35 @@ def _calc_match_score( batch_all_aa_scores: torch.Tensor, truth_aa_indices: torch.Tensor, decoder_reverse: bool = False, -) -> Tuple[torch.Tensor, torch.Tensor]: +) -> Tuple[List[float], List[np.ndarray]]: """ - Calculate the score between the input spectra and associated peptide. + Calculate the score between the input spectra and associated + peptide. - Take in teacher-forced scoring of amino acids - of the peptides (in a batch) and use the truth labels - to calculate a score between the input spectra and - associated peptide. + Take in teacher-forced scoring of amino acids of the peptides (in a + batch) and use the truth labels to calculate a score between the + input spectra and associated peptide. Parameters ---------- batch_all_aa_scores : torch.Tensor - Amino acid scores for all amino acids in - the vocabulary for every prediction made to generate - the associated peptide (for an entire batch) + Amino acid scores for all amino acids in the vocabulary for + every prediction made to generate the associated peptide (for an + entire batch). truth_aa_indices : torch.Tensor - Indices of the score for each actual amino acid - in the peptide (for an entire batch) + Indices of the score for each actual amino acid in the peptide + (for an entire batch). decoder_reverse : bool Whether the decoder is reversed. Returns ------- - all_peptide_scores: List[float] - The score between the input spectra and associated peptide - for each PSM in the batch. - all_aa_scores : List[List[float]] - A list of lists of per amino acid scores - for each PSM in the batch. + peptide_scores: List[float] + The peptide score for each PSM in the batch. + aa_scores : List[np.ndarray] + The amino acid scores for each PSM in the batch. """ - # Remove trailing tokens from predictions based on decoder reversal + # Remove trailing tokens from predictions based on decoder reversal. if not decoder_reverse: batch_all_aa_scores = batch_all_aa_scores[:, 1:] else: @@ -1144,29 +1151,31 @@ def _calc_match_score( per_aa_scores[per_aa_scores == 0] += 1e-10 score_mask = truth_aa_indices != 0 per_aa_scores[~score_mask] = 0 - all_peptide_scores = [] - all_aa_scores = [] + peptide_scores, aa_scores = [], [] for psm_score in per_aa_scores: psm_score = np.trim_zeros(psm_score) - aa_scores, peptide_score = _aa_pep_score(psm_score, True) - all_peptide_scores.append(peptide_score) - all_aa_scores.append(aa_scores) + psm_aa_scores, psm_peptide_score = _aa_pep_score(psm_score, True) + peptide_scores.append(psm_peptide_score) + aa_scores.append(psm_aa_scores) - return all_peptide_scores, all_aa_scores + return peptide_scores, aa_scores class CosineWarmupScheduler(torch.optim.lr_scheduler._LRScheduler): """ - Learning rate scheduler with linear warm-up followed by cosine shaped decay. + Learning rate scheduler with linear warm-up followed by cosine + shaped decay. Parameters ---------- optimizer : torch.optim.Optimizer Optimizer object. warmup_iters : int - The number of iterations for the linear warm-up of the learning rate. + The number of iterations for the linear warm-up of the learning + rate. cosine_schedule_period_iters : int - The number of iterations for the cosine half period of the learning rate. + The number of iterations for the cosine half period of the + learning rate. 
""" def __init__( @@ -1196,8 +1205,8 @@ def _calc_mass_error( calc_mz: float, obs_mz: float, charge: int, isotope: int = 0 ) -> float: """ - Calculate the mass error in ppm between the theoretical m/z and the observed - m/z, optionally accounting for an isotopologue mismatch. + Calculate the mass error in ppm between the theoretical m/z and the + observed m/z, optionally accounting for an isotopologue mismatch. Parameters ---------- @@ -1222,18 +1231,20 @@ def _aa_pep_score( aa_scores: np.ndarray, fits_precursor_mz: bool ) -> Tuple[np.ndarray, float]: """ - Calculate amino acid and peptide-level confidence score from the raw amino - acid scores. + Calculate amino acid and peptide-level confidence score from the raw + amino acid scores. - The peptide score is the mean of the raw amino acid scores. The amino acid - scores are the mean of the raw amino acid scores and the peptide score. + The peptide score is the mean of the raw amino acid scores. The + amino acid scores are the mean of the raw amino acid scores and the + peptide score. Parameters ---------- aa_scores : np.ndarray Amino acid level confidence scores. fits_precursor_mz : bool - Flag indicating whether the prediction fits the precursor m/z filter. + Flag indicating whether the prediction fits the precursor m/z + filter. Returns ------- diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index b097f6d5..4e61b164 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -4,7 +4,6 @@ import glob import logging import os -import re import tempfile import uuid import warnings @@ -25,7 +24,7 @@ from ..data import db_utils, ms_io from ..denovo.dataloaders import DeNovoDataModule from ..denovo.evaluate import aa_match_batch, aa_match_metrics -from ..denovo.model import Spec2Pep, DbSpec2Pep +from ..denovo.model import DbSpec2Pep, Spec2Pep logger = logging.getLogger("casanovo") @@ -45,11 +44,12 @@ class ModelRunner: The directory where checkpoint files will be saved. If `None` no checkpoint files will be saved and a warning will be triggered. output_rootname : str | None, optional - The root name for checkpoint files (e.g., checkpoints or results). If - `None` no base name will be used for checkpoint files. - overwrite_ckpt_check: bool, optional - Whether to check output_dir (if not `None`) for conflicting checkpoint + The root name for checkpoint files (e.g., checkpoints or + results). If `None` no base name will be used for checkpoint files. + overwrite_ckpt_check: bool, optional + Whether to check output_dir (if not `None`) for conflicting + checkpoint files. """ def __init__( @@ -138,11 +138,7 @@ def db_search( fasta_path : str The path with the FASTA file for database search. results_path : str - Sequencing results file path - - Returns - ------- - self + Sequencing results file path. """ self.writer = ms_io.MztabWriter(results_path) self.writer.set_metadata( @@ -189,10 +185,6 @@ def train( The path to the MS data files for training. valid_peak_path : iterable of str The path to the MS data files for validation. - - Returns - ------- - self """ self.initialize_trainer(train=True) self.initialize_model(train=True) @@ -209,16 +201,16 @@ def train( ) def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: - """Log peptide precision and amino acid precision + """Log peptide precision and amino acid precision. 
Calculate and log peptide precision and amino acid precision - based off of model predictions and spectrum annotations + based off of model predictions and spectrum annotations. Parameters ---------- test_index : AnnotatedSpectrumIndex - Index containing the annotated spectra used to generate model - predictions + Index containing the annotated spectra used to generate + model predictions. """ seq_pred = [] seq_true = [] @@ -245,8 +237,9 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: if self.config["top_match"] > 1: logger.warning( - "The behavior for calculating evaluation metrics is undefined when " - "the 'top_match' configuration option is set to a value greater than 1." + "The behavior for calculating evaluation metrics is undefined " + "when the 'top_match' configuration option is set to a value " + "greater than 1." ) logger.info("Peptide Precision: %.2f%%", 100 * pep_precision) @@ -261,13 +254,14 @@ def predict( ) -> None: """Predict peptide sequences with a trained Casanovo model. - Can also evaluate model during prediction if provided with annotated - peak files. + Can also evaluate model during prediction if provided with + annotated peak files. Parameters ---------- peak_path : Iterable[str] - The path with the MS data files for predicting peptide sequences. + The path with the MS data files for predicting peptide + sequences. results_path : str Sequencing results file path evaluate: bool @@ -275,10 +269,6 @@ def predict( Note: peak_path most point to annotated MS data files when running model evaluation. Files that are not an annotated peak file format will be ignored if evaluate is set to true. - - Returns - ------- - self """ self.writer = ms_io.MztabWriter(results_path) self.writer.set_metadata( @@ -363,7 +353,7 @@ def initialize_trainer(self, train: bool) -> None: self.trainer = pl.Trainer(**trainer_cfg) def initialize_model( - self, train: bool, db_search: Optional[bool] = False + self, train: bool, db_search: bool = False ) -> None: """Initialize the Casanovo model. @@ -372,7 +362,7 @@ def initialize_model( train : bool Determines whether to set the model up for model training or evaluation / inference. - db_search : Optional[bool] + db_search : bool Determines whether to use the DB search model subclass. """ tb_summarywriter = None @@ -411,7 +401,8 @@ def initialize_model( calculate_precision=self.config.calculate_precision, ) - # Reconfigurable non-architecture related parameters for a loaded model. + # Reconfigurable non-architecture related parameters for a + # loaded model. loaded_model_params = dict( max_peptide_len=self.config.max_peptide_len, precursor_mass_tol=self.config.precursor_mass_tol, @@ -432,10 +423,8 @@ def initialize_model( if self.model_filename is None: if db_search: - logger.error("DB search mode requires a model file") - raise ValueError( - "A model file must be provided for DB search mode" - ) + logger.error("A model file must be provided for DB search") + raise ValueError("A model file must be provided for DB search") # Train a model from scratch if no model file is provided. if train: self.model = Spec2Pep(**model_params) @@ -444,7 +433,8 @@ def initialize_model( else: logger.error("A model file must be provided") raise ValueError("A model file must be provided") - # Else a model file is provided (to continue training or for inference). + # Else a model file is provided (to continue training or for + # inference). 
if not Path(self.model_filename).exists(): logger.error( @@ -453,15 +443,13 @@ def initialize_model( ) raise FileNotFoundError("Could not find the model weights file") - # First try loading model details from the weights file, otherwise use - # the provided configuration. + # First try loading model details from the weights file, + # otherwise use the provided configuration. device = torch.empty(1).device # Use the default device. Model = DbSpec2Pep if db_search else Spec2Pep try: self.model = Model.load_from_checkpoint( - self.model_filename, - map_location=device, - **loaded_model_params, + self.model_filename, map_location=device, **loaded_model_params ) architecture_params = set(model_params.keys()) - set( @@ -476,7 +464,8 @@ def initialize_model( "using the checkpoint." ) except RuntimeError: - # This only doesn't work if the weights are from an older version + # This only doesn't work if the weights are from an older + # version. try: self.model = Model.load_from_checkpoint( self.model_filename, @@ -497,7 +486,7 @@ def initialize_data_module( Union[AnnotatedSpectrumIndex, SpectrumIndex] ] = None, ) -> None: - """Initialize the data module + """Initialize the data module. Parameters ---------- @@ -536,8 +525,8 @@ def _get_index( ) -> Union[SpectrumIndex, AnnotatedSpectrumIndex]: """Get the spectrum index. - If the file is a SpectrumIndex, only one is allowed. Otherwise multiple - may be specified. + If the file is a SpectrumIndex, only one is allowed. Otherwise + multiple may be specified. Parameters ---------- @@ -597,15 +586,14 @@ def _get_index( def _get_strategy(self) -> Union[str, DDPStrategy]: """Get the strategy for the Trainer. - The DDP strategy works best when multiple GPUs are used. It can work - for CPU-only, but definitely fails using MPS (the Apple Silicon chip) - due to Gloo. + The DDP strategy works best when multiple GPUs are used. It can + work for CPU-only, but definitely fails using MPS (the Apple + Silicon chip) due to Gloo. Returns ------- Union[str, DDPStrategy] The strategy parameter for the Trainer. - """ if self.config.accelerator in ("cpu", "mps"): return "auto" @@ -623,8 +611,8 @@ def _get_peak_filenames( """ Get all matching peak file names from the path pattern. - Performs cross-platform path expansion akin to the Unix shell (glob, expand - user, expand vars). + Performs cross-platform path expansion akin to the Unix shell (glob, + expand user, expand vars). Parameters ---------- From 7fa5f6f68c17c6e55aec25abac14b7cd9c3d7d2d Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Sun, 10 Nov 2024 15:53:56 +0100 Subject: [PATCH 67/84] Reformat with black --- casanovo/data/db_utils.py | 13 +++++++------ casanovo/denovo/model.py | 2 +- casanovo/denovo/model_runner.py | 4 +--- tests/conftest.py | 1 - 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index 55127cff..fb9255db 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -136,9 +136,11 @@ def _digest_fasta( ) ) # Merge proteins from duplicate peptides. - peptides = peptides.groupby("peptide")["protein"].apply( - lambda proteins: sorted(set(proteins)) - ).reset_index() + peptides = ( + peptides.groupby("peptide")["protein"] + .apply(lambda proteins: sorted(set(proteins))) + .reset_index() + ) # Calculate the mass of each peptide. 
mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") peptides["calc_mass"] = peptides["peptide"].apply(mass_calculator.mass) @@ -185,9 +187,8 @@ def get_candidates( ) upper_bound = shift_raw_mass * (1 + precursor_tol_ppm) lower_bound = shift_raw_mass * (1 - precursor_tol_ppm) - mask |= ( - (self.db_peptides["calc_mass"] >= lower_bound) - & (self.db_peptides["calc_mass"] <= upper_bound) + mask |= (self.db_peptides["calc_mass"] >= lower_bound) & ( + self.db_peptides["calc_mass"] <= upper_bound ) return self.db_peptides.index[mask] diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index d309d11c..1c577815 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1019,7 +1019,7 @@ def __init__(self, *args, **kwargs): def predict_step( self, batch: Tuple[torch.Tensor, torch.Tensor, np.ndarray, np.ndarray], - *args + *args, ) -> List[ms_io.PepSpecMatch]: """ A single prediction step. diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 4e61b164..30f86f24 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -352,9 +352,7 @@ def initialize_trainer(self, train: bool) -> None: self.trainer = pl.Trainer(**trainer_cfg) - def initialize_model( - self, train: bool, db_search: bool = False - ) -> None: + def initialize_model(self, train: bool, db_search: bool = False) -> None: """Initialize the Casanovo model. Parameters diff --git a/tests/conftest.py b/tests/conftest.py index 009c0737..a35c5834 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,7 +2,6 @@ import depthcharge import numpy as np -import pandas as pd import psims import pytest import yaml From 7a42e8b0d2209adeb6445c324938d1b45f78ddff Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Sun, 10 Nov 2024 16:02:52 +0100 Subject: [PATCH 68/84] Minor fix --- casanovo/data/db_utils.py | 2 -- casanovo/denovo/dataloaders.py | 8 ++++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index fb9255db..1ee9fab8 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -8,7 +8,6 @@ from typing import Dict, Iterator, Pattern, Set, Tuple import depthcharge.masses -import numba as nb import numpy as np import pandas as pd import pyteomics.fasta @@ -365,7 +364,6 @@ def _convert_from_modx( return swap_regex.sub(lambda x: swap_map[x.group()], seq) -@nb.njit def _to_neutral_mass(mz_mass: float, charge: int) -> float: """ Convert precursor m/z value to neutral mass. diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index f929b1e0..e9759eac 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -228,9 +228,9 @@ def prepare_batch( precursor_mzs = torch.tensor(precursor_mzs) precursor_charges = torch.tensor(precursor_charges) precursor_masses = (precursor_mzs - 1.007276) * precursor_charges - precursors = torch.hstack( + precursors = torch.vstack( [precursor_masses, precursor_charges, precursor_mzs] - ).float() + ).T.float() return spectra, precursors, np.asarray(spectrum_ids) @@ -274,8 +274,8 @@ def prepare_psm_batch( batch_precursors = [] batch_spectrum_ids = [] batch_peptides = [] - # FIXME: This can be optmized by using a sliding window instead of - # retrieving candidates for each spectrum indendently. + # FIXME: This can be optimized by using a sliding window instead of + # retrieving candidates for each spectrum independently. 
for i in range(len(batch)): candidate_pep = protein_database.get_candidates( precursors[i][2], precursors[i][1] From 17d58805419bf62b61bbd10a000d13327c66bde7 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Sun, 10 Nov 2024 17:37:22 +0100 Subject: [PATCH 69/84] Fix output name crash --- casanovo/casanovo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 17786793..3d1811d3 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -255,7 +255,7 @@ def db_search( runner.db_search( peak_path, fasta_path, - str((output_path / output_root).with_suffix(".mztab")), + str((output_path / output_root_name).with_suffix(".mztab")), ) utils.log_run_report(start_time=start_time, end_time=time.time()) From fff5ca418828727d190755b1ebc16d98a69dcb5e Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Sun, 10 Nov 2024 17:58:52 +0100 Subject: [PATCH 70/84] Fix AA score masking --- casanovo/denovo/model.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 1c577815..88f3aaca 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1149,12 +1149,12 @@ def _calc_match_score( per_aa_scores = batch_all_aa_scores[rows, cols, truth_aa_indices] per_aa_scores = per_aa_scores.cpu().detach().numpy() per_aa_scores[per_aa_scores == 0] += 1e-10 - score_mask = truth_aa_indices != 0 - per_aa_scores[~score_mask] = 0 + score_mask = (truth_aa_indices != 0).cpu().detach().numpy() peptide_scores, aa_scores = [], [] - for psm_score in per_aa_scores: - psm_score = np.trim_zeros(psm_score) - psm_aa_scores, psm_peptide_score = _aa_pep_score(psm_score, True) + for psm_score, psm_mask in zip(per_aa_scores, score_mask): + psm_aa_scores, psm_peptide_score = _aa_pep_score( + psm_score[psm_mask], True + ) peptide_scores.append(psm_peptide_score) aa_scores.append(psm_aa_scores) From d18d874301ea18c845db5483a81c4486a29f64c0 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Sun, 10 Nov 2024 18:13:51 +0100 Subject: [PATCH 71/84] Fix PSM export --- casanovo/denovo/model.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 88f3aaca..716dd747 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -2,6 +2,7 @@ import collections import heapq +import itertools import logging import warnings from pathlib import Path @@ -1062,10 +1063,11 @@ def predict_step( batch_aa_scores, psm_batch[3], ): + spectrum_i = tuple(spectrum_i) predictions_all[spectrum_i].append( ms_io.PepSpecMatch( sequence=peptide, - spectrum_id=tuple(spectrum_i), + spectrum_id=spectrum_i, peptide_score=peptide_score, charge=int(charge), calc_mz=self.peptide_mass_calculator.mass( @@ -1079,16 +1081,20 @@ def predict_step( ) ) # Filter the top-scoring prediction(s) for each spectrum. 
- predictions = [ - *( - sorted( - spectrum_predictions, - key=lambda p: p.peptide_score, - reverse=True, - )[: self.top_match] - for spectrum_predictions in predictions_all.values() + predictions = list( + itertools.chain.from_iterable( + [ + *( + sorted( + spectrum_predictions, + key=lambda p: p.peptide_score, + reverse=True, + )[: self.top_match] + for spectrum_predictions in predictions_all.values() + ) + ] ) - ] + ) return predictions def on_predict_batch_end( From b12abd6fc05d868253ad765540b69ef8b4625395 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Thu, 14 Nov 2024 14:19:22 +0100 Subject: [PATCH 72/84] Less verbose logging of skipped peptides --- casanovo/data/db_utils.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index 1ee9fab8..81b9daf8 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -300,6 +300,7 @@ def _peptide_generator( "Enzyme %s not recognized. Interpreting as cleavage rule.", enzyme, ) + n_skipped = 0 if digestion == "non-specific": for header, seq in pyteomics.fasta.read(fasta_filename): protein = header.split()[0] @@ -311,7 +312,8 @@ def _peptide_generator( ): peptide = seq[i:j] if any(aa not in valid_aa for aa in peptide): - logger.warning( + n_skipped += 1 + logger.debug( "Skipping peptide with unknown amino acids: %s", peptide, ) @@ -329,12 +331,17 @@ def _peptide_generator( for peptide in peptides: if min_peptide_len <= len(peptide) <= max_peptide_len: if any(aa not in valid_aa for aa in peptide): - logger.warning( + n_skipped += 1 + logger.debug( "Skipping peptide with unknown amino acids: %s", peptide, ) else: yield peptide, protein + if n_skipped > 0: + logger.warning( + "Skipped %d peptides with unknown amino acids", n_skipped + ) def _convert_from_modx( From b577d594e38c3d25dd3dead874d8fdde6c6c39b8 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Thu, 14 Nov 2024 16:02:02 +0100 Subject: [PATCH 73/84] Appropriate end-of-run reporting --- casanovo/casanovo.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 3d1811d3..fef73a9b 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -196,9 +196,8 @@ def sequence( str((output_path / output_root_name).with_suffix(".mztab")), evaluate=evaluate, ) - psms = runner.writer.psms - utils.log_sequencing_report( - psms, start_time=start_time, end_time=time.time() + utils.log_annotate_report( + runner.writer.psms, start_time=start_time, end_time=time.time() ) @@ -257,7 +256,9 @@ def db_search( fasta_path, str((output_path / output_root_name).with_suffix(".mztab")), ) - utils.log_run_report(start_time=start_time, end_time=time.time()) + utils.log_annotate_report( + runner.writer.psms, start_time=start_time, end_time=time.time() + ) @main.command(cls=_SharedParams) From 510953c5c065f616f64c215483cfca8d05717d23 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Thu, 14 Nov 2024 16:04:43 +0100 Subject: [PATCH 74/84] Fix PSM export from de novo --- casanovo/denovo/model.py | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 716dd747..e5a22760 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -901,9 +901,7 @@ def on_validation_epoch_end(self) -> None: self._log_history() def on_predict_batch_end( - self, - outputs: List[Tuple[np.ndarray, List[str], torch.Tensor]], - *args, + self, outputs: 
List[ms_io.PepSpecMatch], *args ) -> None: """ Write the predicted peptide sequences and amino acid scores to @@ -911,28 +909,9 @@ def on_predict_batch_end( """ if self.out_writer is None: return - # Triply nested lists: results -> batch -> step -> spectrum. - for ( - spectrum_i, - charge, - precursor_mz, - peptide, - peptide_score, - aa_scores, - ) in outputs: - if len(peptide) == 0: - continue - self.out_writer.psms.append( - psm.PepSpecMatch( - sequence=peptide, - spectrum_id=tuple(spectrum_i), - peptide_score=peptide_score, - charge=int(charge), - calc_mz=precursor_mz, - exp_mz=self.peptide_mass_calculator.mass(peptide, charge), - aa_scores=aa_scores, - ) - ) + for pred in outputs: + if len(pred.sequence) > 0: + self.out_writer.psms.append(pred) def _log_history(self) -> None: """ From 3c69711b08045b81aa9637eb97950c0bbb4669d9 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Thu, 14 Nov 2024 16:05:33 +0100 Subject: [PATCH 75/84] Generalize end-of-run reporting --- casanovo/utils.py | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/casanovo/utils.py b/casanovo/utils.py index 6e4273e3..86e0748f 100644 --- a/casanovo/utils.py +++ b/casanovo/utils.py @@ -8,7 +8,7 @@ import socket import sys from datetime import datetime -from typing import Tuple, Dict, List, Optional, Iterable +from typing import Dict, Iterable, List, Optional, Tuple import numpy as np import pandas as pd @@ -18,7 +18,7 @@ from .data.psm import PepSpecMatch -SCORE_BINS = [0.0, 0.5, 0.9, 0.95, 0.99] +SCORE_BINS = (0.0, 0.5, 0.9, 0.95, 0.99) logger = logging.getLogger("casanovo") @@ -27,8 +27,8 @@ def n_workers() -> int: """ Get the number of workers to use for data loading. - This is the maximum number of CPUs allowed for the process, scaled for the - number of GPUs being used. + This is the maximum number of CPUs allowed for the process, scaled + for the number of GPUs being used. On Windows and MacOS, we only use the main process. See: https://discuss.pytorch.org/t/errors-when-using-num-workers-0-in-dataloader/97564/4 @@ -79,7 +79,7 @@ def split_version(version: str) -> Tuple[str, str, str]: def get_score_bins( - scores: pd.Series, score_bins: List[float] + scores: pd.Series, score_bins: Iterable[float] ) -> Dict[float, int]: """ Get binned confidence scores @@ -92,14 +92,14 @@ def get_score_bins( ---------- scores: pd.Series Series of assigned peptide scores. - score_bins: List[float] + score_bins: Iterable[float] Confidence scores to map. Returns ------- score_bin_dict: Dict[float, int] - Dictionary mapping each confidence score to the number of spectra - with a confidence greater than or equal to it. + Dictionary mapping each confidence score to the number of + spectra with a confidence greater than or equal to it. """ return {score: (scores >= score).sum() for score in score_bins} @@ -116,8 +116,8 @@ def get_peptide_lengths(sequences: pd.Series) -> np.ndarray: Returns ------- sequence_lengths: np.ndarray - Numpy array containing the length of each sequence, listed in the - same order that the sequences are provided in. + Numpy array containing the length of each sequence, listed in + the same order that the sequences are provided in. 
""" # Mass modifications do not contribute to sequence length # FIXME: If PTMs are represented in ProForma notation this filtering @@ -126,7 +126,7 @@ def get_peptide_lengths(sequences: pd.Series) -> np.ndarray: def get_report_dict( - results_table: pd.DataFrame, score_bins: List[float] = SCORE_BINS + results_table: pd.DataFrame, score_bins: Iterable[float] = SCORE_BINS ) -> Optional[Dict]: """ Generate sequencing run report @@ -134,15 +134,16 @@ def get_report_dict( Parameters ---------- results_table: pd.DataFrame - Parsed spectrum match table - score_bins: List[float], Optional - Confidence scores for creating confidence CMF, see get_score_bins + Parsed spectrum match table. + score_bins: Iterable[float], Optional + Confidence scores for creating confidence CMF, see + `get_score_bins`. Returns ------- report_gen: Dict Generated report represented as a dictionary, or None if no - sequencing predictions were logged + sequencing predictions were logged. """ if results_table.empty: return None @@ -195,28 +196,26 @@ def log_run_report( logger.info("Max GPU Memory Utilization: %d MiB", gpu_util >> 20) -def log_sequencing_report( +def log_annotate_report( predictions: List[PepSpecMatch], start_time: Optional[float] = None, end_time: Optional[float] = None, - score_bins: List[float] = SCORE_BINS, + score_bins: Iterable[float] = SCORE_BINS, ) -> None: """ - Log sequencing run report + Log run annotation report. Parameters ---------- - next_prediction : Tuple[ - str, Tuple[str, str], float, float, float, float, str - ] - PSM predictions + predictions: List[PepSpecMatch] + PSM predictions. start_time : Optional[float], default=None The start time of the sequencing run in seconds since the epoch. end_time : Optional[float], default=None The end time of the sequencing run in seconds since the epoch. - score_bins: List[float], Optional + score_bins: Iterable[float], Optional Confidence scores for creating confidence score distribution, - see get_score_bins + see `get_score_bins`. 
""" log_run_report(start_time=start_time, end_time=end_time) run_report = get_report_dict( From 15265048b78960c7b933d81a13a1a3e024d782d5 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Thu, 14 Nov 2024 16:05:54 +0100 Subject: [PATCH 76/84] Log additional information on spectra with no matching candidates --- casanovo/denovo/dataloaders.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index e9759eac..4db36b33 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -282,7 +282,11 @@ def prepare_psm_batch( ) if len(candidate_pep) == 0: logger.info( - "No candidate peptides found for spectrum %s", spectrum_ids[i] + "No candidate peptides found for spectrum %s with precursor " + "charge %d and precursor m/z %f", + spectrum_ids[i], + precursors[i][1], + precursors[i][2], ) else: batch_spectra.append( From f18332d48a2f65cde554ea2bfefab8ea10a7a416 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Thu, 14 Nov 2024 16:16:37 +0100 Subject: [PATCH 77/84] Fix linting issue --- casanovo/data/db_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index 81b9daf8..516e91a4 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -341,7 +341,7 @@ def _peptide_generator( if n_skipped > 0: logger.warning( "Skipped %d peptides with unknown amino acids", n_skipped - ) + ) def _convert_from_modx( From a71c4404ea2220ba966e74669bb0b2ab0348f3c2 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Thu, 14 Nov 2024 16:41:15 +0100 Subject: [PATCH 78/84] Fix some testing warnings --- casanovo/denovo/model.py | 2 +- tests/test_integration.py | 4 ++-- tests/unit_tests/test_unit.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index e5a22760..68a8fcc5 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -18,7 +18,7 @@ from . import evaluate from .. import config -from ..data import ms_io, psm +from ..data import ms_io logger = logging.getLogger("casanovo") diff --git a/tests/test_integration.py b/tests/test_integration.py index eeeb498f..7dab1b5b 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -177,8 +177,8 @@ def test_train_and_run( mztab = pyteomics.mztab.MzTab(str(output_filename)) filename = "small.mgf" # Verify that the input annotated peak file is listed in the metadata. - assert f"ms_run[1]-location" in mztab.metadata - assert mztab.metadata[f"ms_run[1]-location"].endswith(filename) + assert "ms_run[1]-location" in mztab.metadata + assert mztab.metadata["ms_run[1]-location"].endswith(filename) # Verify that the spectrum predictions are correct # and indexed according to the peak input file type. 
diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 0d4812f9..00617457 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -454,14 +454,14 @@ def test_aa_pep_score(): def test_peptide_generator_errors(residues_dict, tiny_fasta_file): - with pytest.raises(FileNotFoundError) as e_info: + with pytest.raises(FileNotFoundError): [ (a, b) for a, b in db_utils._peptide_generator( "fail.fasta", "trypsin", "full", 0, 5, 10, residues_dict ) ] - with pytest.raises(ValueError) as e_info: + with pytest.raises(ValueError): [ (a, b) for a, b in db_utils._peptide_generator( From 4aa257b1e43d944dda4160206a30e69e7b56c817 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Thu, 14 Nov 2024 17:03:08 +0100 Subject: [PATCH 79/84] Log digestion settings --- casanovo/data/db_utils.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index 516e91a4..32d975bb 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -88,6 +88,13 @@ def __init__( max_peptide_len, set([aa[0] for aa in residues.keys() if aa[0].isalpha()]), ) + logger.info( + "Digesting FASTA file (enzyme = %s, digestion = %s, missed " + "cleavages = %d)...", + enzyme, + digestion, + missed_cleavages, + ) self.db_peptides = self._digest_fasta(peptide_generator) self.precursor_tolerance = precursor_tolerance self.isotope_error = isotope_error @@ -150,7 +157,7 @@ def _digest_fasta( peptides.set_index("peptide", inplace=True) logger.info( - "Digestion complete. %d peptides generated.", len(peptides) + "Digestion complete. %s peptides generated.", f"{len(peptides):,d}" ) return peptides From d54b66fa5242d50a625bcb5654db99c8d7a849a9 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Thu, 14 Nov 2024 17:05:16 +0100 Subject: [PATCH 80/84] Reduce logging level for spectra without candidates --- casanovo/denovo/dataloaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 4db36b33..cdbf71bf 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -281,7 +281,7 @@ def prepare_psm_batch( precursors[i][2], precursors[i][1] ) if len(candidate_pep) == 0: - logger.info( + logger.debug( "No candidate peptides found for spectrum %s with precursor " "charge %d and precursor m/z %f", spectrum_ids[i], From d97e251428a6f6108d4d3c954d648efde4ce82fd Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Mon, 18 Nov 2024 16:30:54 +0100 Subject: [PATCH 81/84] Round peptide masses for consistent sorting --- casanovo/data/db_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index 32d975bb..95ef2d13 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -149,7 +149,10 @@ def _digest_fasta( ) # Calculate the mass of each peptide. mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") - peptides["calc_mass"] = peptides["peptide"].apply(mass_calculator.mass) + peptides["calc_mass"] = ( + peptides["peptide"].apply(mass_calculator.mass) + .round(5) + ) # Sort by peptide mass and index by peptide sequence. 
peptides.sort_values( by=["calc_mass", "peptide"], ascending=True, inplace=True From db5e00f063829e12f731a1106f61ab0e8bab1788 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Mon, 18 Nov 2024 16:39:37 +0100 Subject: [PATCH 82/84] Fox linting --- casanovo/data/db_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index 95ef2d13..d3670930 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -150,8 +150,7 @@ def _digest_fasta( # Calculate the mass of each peptide. mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") peptides["calc_mass"] = ( - peptides["peptide"].apply(mass_calculator.mass) - .round(5) + peptides["peptide"].apply(mass_calculator.mass).round(5) ) # Sort by peptide mass and index by peptide sequence. peptides.sort_values( From 1e565c4acaead600e67c0b1c59ec51ca7ebb2c57 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Mon, 18 Nov 2024 20:31:56 +0100 Subject: [PATCH 83/84] Remove superfluous PSM export --- casanovo/denovo/model.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 68a8fcc5..f350f3b3 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1076,14 +1076,6 @@ def predict_step( ) return predictions - def on_predict_batch_end( - self, outputs: List[ms_io.PepSpecMatch], *args - ) -> None: - """ - Write the database search results to the output file. - """ - self.out_writer.psms.extend(outputs) - def _calc_match_score( batch_all_aa_scores: torch.Tensor, From 18999cf2f0e682437854fe7f0db746321e27f641 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Mon, 18 Nov 2024 20:37:44 +0100 Subject: [PATCH 84/84] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 240185d1..c73eec8b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ### Added +- Casanovo-DB mode (`casanovo db_search`) to use Casanovo as a learned score function for sequence database searching (given a FASTA protein database). - During training, model checkpoints will be saved at the end of each training epoch in addition to the checkpoints saved at the end of every validation run. - Besides as a local file, model weights can be specified from a URL. Upon initial download, the weights file is cached for future re-use. - Training and optimizer metrics can now be logged to a CSV file by setting the `log_metrics` config file option to true - the CSV file will be written to under a sub-directory of the output directory named `csv_logs`.
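
As a supplement to the Casanovo-DB patches above, the snippet below is a minimal, self-contained sketch of the teacher-forced scoring idea documented in `_calc_match_score` and `_aa_pep_score`: softmax amino acid probabilities are gathered at the ground-truth token indices, padding positions are masked out, the peptide score is taken as the mean of the per-residue probabilities, and the residue scores are then averaged with that peptide score. It is not code from the Casanovo repository; all function and variable names are illustrative assumptions, and the tensor shapes (PSMs x decoding steps x vocabulary size, with token index 0 as padding) are assumed from the docstrings in the diffs.

# Illustrative sketch only -- not part of the patch series above.
import numpy as np
import torch


def sketch_psm_scores(aa_probs: torch.Tensor, truth_tokens: torch.Tensor):
    """Score each PSM as the mean per-residue probability of its peptide.

    aa_probs : (n_psms, n_steps, n_tokens) softmax scores from teacher forcing.
    truth_tokens : (n_psms, n_steps) ground-truth token indices (0 = padding).
    """
    n_psms, n_steps, _ = aa_probs.shape
    rows = torch.arange(n_psms).unsqueeze(1).expand(-1, n_steps)
    cols = torch.arange(n_steps).unsqueeze(0).expand(n_psms, -1)
    # Gather the probability assigned to the true residue at every step.
    per_aa = aa_probs[rows, cols, truth_tokens].cpu().numpy()
    keep_mask = (truth_tokens != 0).cpu().numpy()  # Drop padding positions.
    peptide_scores, residue_scores = [], []
    for probs, keep in zip(per_aa, keep_mask):
        scores = probs[keep]
        pep_score = float(scores.mean())  # Peptide score = mean residue score.
        # Residue scores are the mean of the raw scores and the peptide score.
        residue_scores.append((scores + pep_score) / 2)
        peptide_scores.append(pep_score)
    return peptide_scores, residue_scores


# Example usage: two PSMs, three decoding steps, a five-token vocabulary.
probs = torch.softmax(torch.randn(2, 3, 5), dim=-1)
truth = torch.tensor([[3, 1, 0], [2, 4, 1]])  # 0 marks padding.
pep_scores, aa_scores = sketch_psm_scores(probs, truth)

Reusing the same mean-based aggregation for database-search candidates presumably keeps the Casanovo-DB scores on a scale comparable to the de novo peptide scores, since both modes report scores through the same `_aa_pep_score` helper in the diffs above.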