diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d6e77116..d94fc040 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/psf/black - rev: stable # Replace by any tag/version: https://github.com/psf/black/tags + rev: 22.6.0 # Replace by any tag/version: https://github.com/psf/black/tags hooks: - id: black language_version: python3 # Should be a command that runs python3.6+ diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 61617f10..46eed044 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -5,23 +5,69 @@ import os from casanovo.denovo import train, evaluate, denovo -#Required options +# Required options @click.command() -@click.option("--mode", required=True, default='eval', help="Choose on a high level what the program will do. \"train\" will train a model from scratch or continue training a pre-trained model. \"eval\" will evaluate de novo sequencing performance of a pre-trained model (peptide annotations are needed for spectra). \"denovo\" will run de novo sequencing without evaluation (specificy directory path for output csv file with de novo sequences).", type=click.Choice(['train', 'eval', 'denovo'])) -@click.option("--model_path", required=True, help="Specify path to pre-trained model weights (.ckpt file) for testing or to continue to train.", type=click.Path(exists=True, dir_okay=False, file_okay=True)) -#Base options -@click.option("--train_data_path", help="Specify path to .mgf files to be used as training data", type=click.Path(exists=True, dir_okay=True, file_okay=False)) -@click.option("--val_data_path", help="Specify path to .mgf files to be used as validation data", type=click.Path(exists=True, dir_okay=True, file_okay=False)) -@click.option("--test_data_path", help="Specify path to .mgf files to be used as test data", type=click.Path(exists=True, dir_okay=True, file_okay=False)) -@click.option("--config_path", help="Specify path to custom config file which includes data and model related options. If not included, the default config.yaml will be used.", type=click.Path(exists=True, dir_okay=False, file_okay=True)) -@click.option("--output_path", help="Specify path to output de novo sequences. Output format is .csv", type=click.Path(exists=True, dir_okay=True, file_okay=False)) -#De Novo sequencing options -@click.option("--preprocess_spec", default=None, help="True if spectra data should be preprocessed, False if using preprocessed data.", type=click.BOOL) -@click.option("--num_workers", default=None, help="Number of workers to use for spectra reading.", type=click.INT) -@click.option("--gpus", default=(), help="Specify gpus for usage. For multiple gpus, use format: --gpus=0 --gpus=1 --gpus=2... etc. etc.", type=click.INT, multiple=True) - +@click.option( + "--mode", + required=True, + default="eval", + help='Choose on a high level what the program will do. "train" will train a model from scratch or continue training a pre-trained model. "eval" will evaluate de novo sequencing performance of a pre-trained model (peptide annotations are needed for spectra). "denovo" will run de novo sequencing without evaluation (specificy directory path for output csv file with de novo sequences).', + type=click.Choice(["train", "eval", "denovo"]), +) +@click.option( + "--model_path", + required=True, + help="Specify path to pre-trained model weights (.ckpt file) for testing or to continue to train.", + type=click.Path(exists=True, dir_okay=False, file_okay=True), +) +# Base options +@click.option( + "--train_data_path", + help="Specify path to .mgf files to be used as training data", + type=click.Path(exists=True, dir_okay=True, file_okay=False), +) +@click.option( + "--val_data_path", + help="Specify path to .mgf files to be used as validation data", + type=click.Path(exists=True, dir_okay=True, file_okay=False), +) +@click.option( + "--test_data_path", + help="Specify path to .mgf files to be used as test data", + type=click.Path(exists=True, dir_okay=True, file_okay=False), +) +@click.option( + "--config_path", + help="Specify path to custom config file which includes data and model related options. If not included, the default config.yaml will be used.", + type=click.Path(exists=True, dir_okay=False, file_okay=True), +) +@click.option( + "--output_path", + help="Specify path to output de novo sequences. Output format is .csv", + type=click.Path(exists=True, dir_okay=True, file_okay=False), +) +# De Novo sequencing options +@click.option( + "--preprocess_spec", + default=None, + help="True if spectra data should be preprocessed, False if using preprocessed data.", + type=click.BOOL, +) +@click.option( + "--num_workers", + default=None, + help="Number of workers to use for spectra reading.", + type=click.INT, +) +@click.option( + "--gpus", + default=(), + help="Specify gpus for usage. For multiple gpus, use format: --gpus=0 --gpus=1 --gpus=2... etc. etc.", + type=click.INT, + multiple=True, +) def main( - #Req + base vars + # Req + base vars mode, model_path, train_data_path, @@ -29,62 +75,63 @@ def main( test_data_path, config_path, output_path, - #De Novo vars + # De Novo vars preprocess_spec, num_workers, - gpus + gpus, ): """ The command line function for casanovo. De Novo Mass Spectrometry Peptide Sequencing with a Transformer Model. - + \b Training option requirements: mode, model_path, train_data_path, val_data_path, config_path - + \b Evaluation option requirements: mode, model_path, test_data_path, config_path - + \b De Novo option requirements: mode, model_path, test_data_path, config_path, output_path """ logging.basicConfig( - level=logging.INFO, - format="%(levelname)s: %(message)s", -) + level=logging.INFO, + format="%(levelname)s: %(message)s", + ) if config_path == None: - abs_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.yaml') + abs_path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "config.yaml" + ) with open(abs_path) as f: config = yaml.safe_load(f) else: with open(config_path) as f: config = yaml.safe_load(f) - if(preprocess_spec != None): - config['preprocess_spec'] = preprocess_spec - if(num_workers != None): - config['num_workers'] = num_workers - if(gpus != ()): - config['gpus'] = gpus - if mode == 'train': + if preprocess_spec != None: + config["preprocess_spec"] = preprocess_spec + if num_workers != None: + config["num_workers"] = num_workers + if gpus != (): + config["gpus"] = gpus + if mode == "train": - logging.info('Training Casanovo...') + logging.info("Training Casanovo...") train(train_data_path, val_data_path, model_path, config) - - elif mode == 'eval': - - logging.info('Evaluating Casanovo...') + + elif mode == "eval": + + logging.info("Evaluating Casanovo...") evaluate(test_data_path, model_path, config) - elif mode == 'denovo': - - logging.info('De novo sequencing with Casanovo...') - denovo(test_data_path, model_path, config, output_path) - + elif mode == "denovo": + + logging.info("De novo sequencing with Casanovo...") + denovo(test_data_path, model_path, config, output_path) + pass if __name__ == "__main__": main() - diff --git a/casanovo/data/__init__.py b/casanovo/data/__init__.py index a0363212..8d62cf78 100644 --- a/casanovo/data/__init__.py +++ b/casanovo/data/__init__.py @@ -2,4 +2,4 @@ from .datasets import ( SpectrumDataset, AnnotatedSpectrumDataset, -) \ No newline at end of file +) diff --git a/casanovo/data/datasets.py b/casanovo/data/datasets.py index b694ac80..4550e425 100644 --- a/casanovo/data/datasets.py +++ b/casanovo/data/datasets.py @@ -20,19 +20,19 @@ class SpectrumDataset(Dataset): The minimum m/z to include. The default is 140 m/z, in order to exclude TMT and iTRAQ reporter ions. max_mz : float, optional - The maximum m/z to include. + The maximum m/z to include. min_intensity : float, optional Remove peaks whose intensity is below `min_intensity` percentage of the intensity of the most intense peak fragment_tol_mass : float, optional Fragment mass tolerance around the precursor mass in Da to remove the - precursor peak. + precursor peak. random_state : int or RandomState, optional. The numpy random state. ``None`` leaves mass spectra in the order they were parsed. preprocess_spec : bool, optional Preprocess the provided spectra - + Attributes ---------- n_peaks : int @@ -40,13 +40,13 @@ class SpectrumDataset(Dataset): min_mz : float The minimum m/z to consider for each mass spectrum. max_mz : float - The maximum m/z to include. + The maximum m/z to include. min_intensity : float Remove peaks whose intensity is below `min_intensity` percentage of the intensity of the most intense peak fragment_tol_mass : float Fragment mass tolerance around the precursor mass in Da to remove the - precursor peak. + precursor peak. n_spectra : int index : depthcharge.data.SpectrumIndex rng : numpy.random.Generator @@ -60,9 +60,9 @@ def __init__( min_mz=140, max_mz=2500, min_intensity=0.01, - fragment_tol_mass=2, + fragment_tol_mass=2, random_state=None, - preprocess_spec=False + preprocess_spec=False, ): """Initialize a SpectrumDataset""" super().__init__() @@ -70,7 +70,7 @@ def __init__( self.min_mz = min_mz self.max_mz = max_mz self.min_intensity = min_intensity - self.fragment_tol_mass = fragment_tol_mass + self.fragment_tol_mass = fragment_tol_mass self.rng = np.random.default_rng(random_state) self._index = spectrum_index self.preprocess_spec = preprocess_spec @@ -100,17 +100,19 @@ def __getitem__(self, idx): The unique identifier for spectrum based on its order in the original mgf file """ mz_array, int_array, prec_mz, prec_charge = self.index[idx] - + if self.preprocess_spec == True: - spec = self._process_peaks(mz_array, int_array, prec_mz, prec_charge) + spec = self._process_peaks( + mz_array, int_array, prec_mz, prec_charge + ) else: spec = torch.tensor(np.array([mz_array, int_array])).T.float() - + if not spec.sum(): - spec = torch.tensor([[0, 1]]).float() - - spectrum_order_id = f'{idx}' - + spec = torch.tensor([[0, 1]]).float() + + spectrum_order_id = f"{idx}" + return spec, prec_mz, prec_charge, spectrum_order_id def _get_non_precursor_peak_mask( @@ -149,7 +151,7 @@ def _get_non_precursor_peak_mask( mask = np.full_like(mz, True, np.bool_) mz_i = remove_i = 0 while mz_i < len(mz) and remove_i < len(remove_mz): - md = mz[mz_i] - remove_mz[remove_i] # in Da + md = mz[mz_i] - remove_mz[remove_i] # in Da if md < -fragment_tol_mass: mz_i += 1 elif md > fragment_tol_mass: @@ -158,9 +160,11 @@ def _get_non_precursor_peak_mask( mask[mz_i] = False mz_i += 1 - return mask - - def _get_filter_intensity_mask(self, intensity, min_intensity, max_num_peaks): + return mask + + def _get_filter_intensity_mask( + self, intensity, min_intensity, max_num_peaks + ): """ Get a mask to remove low-intensity peaks and retain only the given number of most intense peaks. @@ -195,7 +199,7 @@ def _get_filter_intensity_mask(self, intensity, min_intensity, max_num_peaks): intensity_idx[max(start_i, len(intensity_idx) - max_num_peaks) :] ] = True return mask - + def _process_peaks(self, mz_array, int_array, prec_mz, prec_charge): """Choose the top n peaks and normalize the spectrum intensities. @@ -208,7 +212,7 @@ def _process_peaks(self, mz_array, int_array, prec_mz, prec_charge): precursor_mz : float The m/z of the precursor. precursor_charge : int - The charge of the precursor. + The charge of the precursor. Returns ------- @@ -216,13 +220,13 @@ def _process_peaks(self, mz_array, int_array, prec_mz, prec_charge): The mass spectrum where ``spectrum[:, 0]`` are the m/z values and ``spectrum[:, 1]`` are their associated intensities. """ - #Set m/z range for fragments + # Set m/z range for fragments if self.min_mz is not None: keep = (mz_array >= self.min_mz) & (mz_array <= self.max_mz) mz_array = mz_array[keep] int_array = int_array[keep] - - #Remove fragment peak(s) close to the precursor m/z + + # Remove fragment peak(s) close to the precursor m/z neutral_mass = (prec_mz - 1.0072766) * prec_charge peak_mask = self._get_non_precursor_peak_mask( mz_array, @@ -232,13 +236,17 @@ def _process_peaks(self, mz_array, int_array, prec_mz, prec_charge): ) mz_array = mz_array[peak_mask] int_array = int_array[peak_mask] - - #Remove low-intensity fragment peaks and keep a maximum pre-specified number of peaks - top_p = self._get_filter_intensity_mask(intensity=int_array, min_intensity=self.min_intensity, max_num_peaks=self.n_peaks) + + # Remove low-intensity fragment peaks and keep a maximum pre-specified number of peaks + top_p = self._get_filter_intensity_mask( + intensity=int_array, + min_intensity=self.min_intensity, + max_num_peaks=self.n_peaks, + ) mz_array = mz_array[top_p] int_array = int_array[top_p] - - #Square root normalize the peak intensities + + # Square root normalize the peak intensities int_array = np.sqrt(int_array) int_array = int_array / np.linalg.norm(int_array) @@ -264,6 +272,7 @@ def rng(self, seed): """Set the numpy random number generator.""" self._rng = np.random.default_rng(seed) + class AnnotatedSpectrumDataset(SpectrumDataset): """Parse and retrieve collections of mass spectra @@ -278,13 +287,13 @@ class AnnotatedSpectrumDataset(SpectrumDataset): The minimum m/z to include. The default is 140 m/z, in order to exclude TMT and iTRAQ reporter ions. max_mz : float - The maximum m/z to include. + The maximum m/z to include. min_intensity : float Remove peaks whose intensity is below `min_intensity` percentage of the intensity of the most intense peak fragment_tol_mass : float Fragment mass tolerance around the precursor mass in Da to remove the - precursor peak. + precursor peak. random_state : int or RandomState, optional. The numpy random state. ``None`` leaves mass spectra in the order they were parsed. @@ -298,13 +307,13 @@ class AnnotatedSpectrumDataset(SpectrumDataset): min_mz : float The minimum m/z to consider for each mass spectrum. max_mz : float - The maximum m/z to include. + The maximum m/z to include. min_intensity : float Remove peaks whose intensity is below `min_intensity` percentage of the intensity of the most intense peak fragment_tol_mass : float Fragment mass tolerance around the precursor mass in Da to remove the - precursor peak. + precursor peak. n_spectra : int index : depthcharge.data.SpectrumIndex rng : numpy.random.Generator @@ -318,9 +327,9 @@ def __init__( min_mz=140, max_mz=2500, min_intensity=0.01, - fragment_tol_mass=2, + fragment_tol_mass=2, random_state=None, - preprocess_spec=False + preprocess_spec=False, ): """Initialize an AnnotatedSpectrumDataset""" super().__init__( @@ -329,9 +338,9 @@ def __init__( min_mz=min_mz, max_mz=max_mz, min_intensity=min_intensity, - fragment_tol_mass=fragment_tol_mass, + fragment_tol_mass=fragment_tol_mass, random_state=random_state, - preprocess_spec=preprocess_spec + preprocess_spec=preprocess_spec, ) def __getitem__(self, idx): @@ -356,7 +365,9 @@ def __getitem__(self, idx): """ mz_array, int_array, prec_mz, prec_charge, pep = self.index[idx] if self.preprocess_spec == True: - spec = self._process_peaks(mz_array, int_array, prec_mz, prec_charge) + spec = self._process_peaks( + mz_array, int_array, prec_mz, prec_charge + ) else: spec = torch.tensor(np.array([mz_array, int_array])).T.float() return spec, prec_mz, prec_charge, pep diff --git a/casanovo/denovo/__init__.py b/casanovo/denovo/__init__.py index c845d2a5..a90f7d00 100644 --- a/casanovo/denovo/__init__.py +++ b/casanovo/denovo/__init__.py @@ -2,4 +2,3 @@ from .model import Spec2Pep from .dataloaders import DeNovoDataModule from .train_test import train, denovo, evaluate - diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 82c9e97e..24a68e14 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -32,13 +32,13 @@ class DeNovoDataModule(pl.LightningDataModule): The minimum m/z to include. The default is 140 m/z, in order to exclude TMT and iTRAQ reporter ions. max_mz : float, optional - The maximum m/z to include. + The maximum m/z to include. min_intensity : float, optional Remove peaks whose intensity is below `min_intensity` percentage of the intensity of the most intense peak fragment_tol_mass : float, optional Fragment mass tolerance around the precursor mass in Da to remove the - precursor peak. + precursor peak. num_workers : int, optional The number of workers to use for data loading. By default, the number of available CPU cores on the current machine is used. @@ -57,10 +57,10 @@ def __init__( min_mz=140, max_mz=2500, min_intensity=0.01, - fragment_tol_mass=2, + fragment_tol_mass=2, num_workers=None, random_state=None, - preprocess_spec=False + preprocess_spec=False, ): """Initialize the PairedSpectrumDataModule.""" super().__init__() @@ -72,7 +72,7 @@ def __init__( self.min_mz = min_mz self.max_mz = max_mz self.min_intensity = min_intensity - self.fragment_tol_mass = fragment_tol_mass + self.fragment_tol_mass = fragment_tol_mass self.num_workers = num_workers self.rng = np.random.default_rng(random_state) self.preprocess_spec = preprocess_spec @@ -102,7 +102,7 @@ def setup(self, stage=None, annotated=True): min_mz=self.min_mz, max_mz=self.max_mz, min_intensity=self.min_intensity, - fragment_tol_mass=self.fragment_tol_mass, + fragment_tol_mass=self.fragment_tol_mass, preprocess_spec=self.preprocess_spec, ) if self.train_index is not None: @@ -114,29 +114,29 @@ def setup(self, stage=None, annotated=True): self.valid_dataset = make_dataset(self.valid_index) if stage in (None, "test"): - if annotated == True: + if annotated == True: make_dataset = partial( AnnotatedSpectrumDataset, n_peaks=self.n_peaks, min_mz=self.min_mz, max_mz=self.max_mz, min_intensity=self.min_intensity, - fragment_tol_mass=self.fragment_tol_mass, - preprocess_spec=self.preprocess_spec - ) - else: + fragment_tol_mass=self.fragment_tol_mass, + preprocess_spec=self.preprocess_spec, + ) + else: make_dataset = partial( SpectrumDataset, n_peaks=self.n_peaks, min_mz=self.min_mz, max_mz=self.max_mz, min_intensity=self.min_intensity, - fragment_tol_mass=self.fragment_tol_mass, - preprocess_spec=self.preprocess_spec - ) + fragment_tol_mass=self.fragment_tol_mass, + preprocess_spec=self.preprocess_spec, + ) if self.test_index is not None: self.test_dataset = make_dataset(self.test_index) - + def _make_loader(self, dataset): """Create a PyTorch DataLoader. @@ -190,7 +190,7 @@ def prepare_batch(batch): precursors : torch.Tensor of shape (batch_size, 2) The precursor mass and charge state. sequence_or_ids : list of str - The peptide sequence annotations in training, the spectrum identifier in de novo sequencing + The peptide sequence annotations in training, the spectrum identifier in de novo sequencing """ spec, mz, charge, sequence_or_ids = list(zip(*batch)) charge = torch.tensor(charge) diff --git a/casanovo/denovo/evaluate.py b/casanovo/denovo/evaluate.py index 4adcc8d6..caba1f68 100644 --- a/casanovo/denovo/evaluate.py +++ b/casanovo/denovo/evaluate.py @@ -4,9 +4,10 @@ from depthcharge.masses import PeptideMass + def best_aa_match(orig_seq, pred_seq, aa_dict): """ - Find the matching amino acids of an original and predicted peptide sequence if either their prefix or suffix match + Find the matching amino acids of an original and predicted peptide sequence if either their prefix or suffix match Parameters ---------- @@ -16,45 +17,66 @@ def best_aa_match(orig_seq, pred_seq, aa_dict): List of amino acids in the predicted peptide aa_dict: dict Dictionary of amino acid masses - + Returns ------- aa_match: list Binary list of matches over predicted peptide pep_match: int - 1 if all amino acid in two sequences match + 1 if all amino acid in two sequences match + + """ - """ - cum_aa_threshold = 0.5 single_aa_threshold = 0.1 - aa_match = [] pep_match = 0 cnt_pred = 0 cnt_orig = 0 - + orig_aa_mass_list = [aa_dict[aa] for aa in orig_seq] pred_aa_mass_list = [aa_dict[aa] for aa in pred_seq] - - orig_prefix_mass_list = [0]+list(np.cumsum(orig_aa_mass_list))[:-1] - orig_suffix_mass_list = list(reversed(np.cumsum(list(reversed(orig_aa_mass_list)))[:-1]))+ [0] - - - pred_prefix_mass_list = [0]+list(np.cumsum(pred_aa_mass_list))[:-1] - pred_suffix_mass_list = list(reversed(np.cumsum(list(reversed(pred_aa_mass_list)))[:-1]))+ [0] - - + + orig_prefix_mass_list = [0] + list(np.cumsum(orig_aa_mass_list))[:-1] + orig_suffix_mass_list = list( + reversed(np.cumsum(list(reversed(orig_aa_mass_list)))[:-1]) + ) + [0] + + pred_prefix_mass_list = [0] + list(np.cumsum(pred_aa_mass_list))[:-1] + pred_suffix_mass_list = list( + reversed(np.cumsum(list(reversed(pred_aa_mass_list)))[:-1]) + ) + [0] + while cnt_pred < len(pred_seq) and cnt_orig < len(orig_seq): - + pred_aa_mass = aa_dict[pred_seq[cnt_pred]] orig_aa_mass = aa_dict[orig_seq[cnt_orig]] - - if abs(mass_diff(pred_prefix_mass_list[cnt_pred],orig_prefix_mass_list[cnt_orig], mode_is_da = True)) < cum_aa_threshold or abs(mass_diff(pred_suffix_mass_list[cnt_pred],orig_suffix_mass_list[cnt_orig], mode_is_da = True)) < cum_aa_threshold: - if abs(mass_diff(pred_aa_mass, orig_aa_mass, mode_is_da = True)) < single_aa_threshold: + if ( + abs( + mass_diff( + pred_prefix_mass_list[cnt_pred], + orig_prefix_mass_list[cnt_orig], + mode_is_da=True, + ) + ) + < cum_aa_threshold + or abs( + mass_diff( + pred_suffix_mass_list[cnt_pred], + orig_suffix_mass_list[cnt_orig], + mode_is_da=True, + ) + ) + < cum_aa_threshold + ): + + if ( + abs(mass_diff(pred_aa_mass, orig_aa_mass, mode_is_da=True)) + < single_aa_threshold + ): aa_match += [1] else: aa_match += [0] @@ -62,23 +84,29 @@ def best_aa_match(orig_seq, pred_seq, aa_dict): cnt_pred += 1 cnt_orig += 1 - - elif mass_diff(pred_prefix_mass_list[cnt_pred],orig_prefix_mass_list[cnt_orig], mode_is_da = True) > 0: + elif ( + mass_diff( + pred_prefix_mass_list[cnt_pred], + orig_prefix_mass_list[cnt_orig], + mode_is_da=True, + ) + > 0 + ): cnt_orig += 1 - + else: cnt_pred += 1 aa_match += [0] - + if sum(aa_match) == len(orig_seq) and len(pred_seq) == len(orig_seq): pep_match = 1 return aa_match, pep_match - + def find_aa_match_single_pep(orig_seq, pred_seq, aa_dict): """ - Find the matching amino acids of an original and predicted peptide sequence as described in DeepNovo. + Find the matching amino acids of an original and predicted peptide sequence as described in DeepNovo. Parameters ---------- @@ -88,13 +116,13 @@ def find_aa_match_single_pep(orig_seq, pred_seq, aa_dict): List of amino acids in the predicted peptide aa_dict: dict Dictionary of amino acid masses - + Returns ------- aa_match: list Binary list of c pep_match: int - 1 if all amino acid in two sequences match + 1 if all amino acid in two sequences match """ @@ -109,15 +137,21 @@ def find_aa_match_single_pep(orig_seq, pred_seq, aa_dict): cnt_pred = 0 cnt_orig = 0 - + while cnt_pred < len(pred_seq) and cnt_orig < len(orig_seq): - + pred_aa_mass = aa_dict[pred_seq[cnt_pred]] orig_aa_mass = aa_dict[orig_seq[cnt_orig]] - - if abs(mass_diff(pred_cum_aa_mass,orig_cum_aa_mass, mode_is_da = True)) < cum_aa_threshold: - if abs(mass_diff(pred_aa_mass, orig_aa_mass, mode_is_da = True)) < single_aa_threshold: + if ( + abs(mass_diff(pred_cum_aa_mass, orig_cum_aa_mass, mode_is_da=True)) + < cum_aa_threshold + ): + + if ( + abs(mass_diff(pred_aa_mass, orig_aa_mass, mode_is_da=True)) + < single_aa_threshold + ): aa_match += [1] else: aa_match += [0] @@ -127,25 +161,27 @@ def find_aa_match_single_pep(orig_seq, pred_seq, aa_dict): pred_cum_aa_mass += pred_aa_mass orig_cum_aa_mass += orig_aa_mass - - elif mass_diff(pred_cum_aa_mass,orig_cum_aa_mass, mode_is_da = True) > 0: + + elif ( + mass_diff(pred_cum_aa_mass, orig_cum_aa_mass, mode_is_da=True) > 0 + ): cnt_orig += 1 orig_cum_aa_mass += orig_aa_mass - + else: cnt_pred += 1 pred_cum_aa_mass += pred_aa_mass aa_match += [0] - + if sum(aa_match) == len(orig_seq) and len(pred_seq) == len(orig_seq): pep_match = 1 return aa_match, pep_match -def match_aa(orig_seq, pred_seq, aa_dict, eval_direction = 'best'): +def match_aa(orig_seq, pred_seq, aa_dict, eval_direction="best"): """ - Find the matching amino acids of an original and predicted peptide + Find the matching amino acids of an original and predicted peptide Parameters ---------- @@ -157,43 +193,47 @@ def match_aa(orig_seq, pred_seq, aa_dict, eval_direction = 'best'): Dictionary of amino acid masses eval_direction: str, default: 'best' Direction of evaluation while finding amino acid matches, e.g. 'forward', 'backward', 'best' - + Returns ------- aa_match: list Binary list of c pep_match: int - 1 if all amino acid in two sequences match + 1 if all amino acid in two sequences match """ - - if eval_direction == 'best': + + if eval_direction == "best": aa_match, pep_match = best_aa_match(orig_seq, pred_seq, aa_dict) n_mismatch_aa = len(pred_seq) - len(aa_match) aa_match += n_mismatch_aa * [0] - - - elif eval_direction == 'forward': - aa_match, pep_match = find_aa_match_single_pep(orig_seq,pred_seq, aa_dict) - + + elif eval_direction == "forward": + aa_match, pep_match = find_aa_match_single_pep( + orig_seq, pred_seq, aa_dict + ) + n_mismatch_aa = len(pred_seq) - len(aa_match) aa_match += n_mismatch_aa * [0] - - elif eval_direction == 'backward': - reverse_aa_match, pep_match = find_aa_match_single_pep(list(reversed(orig_seq)), - list(reversed(pred_seq)), aa_dict) - + + elif eval_direction == "backward": + reverse_aa_match, pep_match = find_aa_match_single_pep( + list(reversed(orig_seq)), list(reversed(pred_seq)), aa_dict + ) + aa_match = list(reversed(reverse_aa_match)) n_mismatch_aa = len(pred_seq) - len(aa_match) aa_match = n_mismatch_aa * [0] + aa_match - - + return aa_match, pep_match -def batch_aa_match(pred_pep_seqs, true_pep_seqs, aa_dict, eval_direction = 'best'): + +def batch_aa_match( + pred_pep_seqs, true_pep_seqs, aa_dict, eval_direction="best" +): """ - Find the matching amino acids of an original and predicted peptide + Find the matching amino acids of an original and predicted peptide Parameters ---------- @@ -205,16 +245,16 @@ def batch_aa_match(pred_pep_seqs, true_pep_seqs, aa_dict, eval_direction = 'best Dictionary of amino acid masses eval_direction: str, default: 'best' Direction of evaluation while finding amino acid matches, e.g. 'forward', 'backward', 'best' - + Returns ------- all_aa_match: list Binary list of lists corresponding to amino acid matches for all predicted peptides orig_total_num_aa: int - Total number of amino acids in the ground truth peptide labels + Total number of amino acids in the ground truth peptide labels pred_total_num_aa: int - Total number of amino acids in the predicted peptide labels + Total number of amino acids in the predicted peptide labels """ @@ -223,21 +263,26 @@ def batch_aa_match(pred_pep_seqs, true_pep_seqs, aa_dict, eval_direction = 'best all_aa_match = [] for pred_ind in range(len(pred_pep_seqs)): - + pred = re.split(r"(?<=.)(?=[A-Z])", pred_pep_seqs[pred_ind]) orig = re.split(r"(?<=.)(?=[A-Z])", true_pep_seqs[pred_ind]) orig_total_num_aa += len(orig) pred_total_num_aa += len(pred) - aa_match, pep_match = match_aa(orig, pred, aa_dict, eval_direction=eval_direction) + aa_match, pep_match = match_aa( + orig, pred, aa_dict, eval_direction=eval_direction + ) all_aa_match += [(aa_match, pep_match)] - + return all_aa_match, orig_total_num_aa, pred_total_num_aa - -def calc_eval_metrics(aa_match_binary_list, orig_total_num_aa, pred_total_num_aa): + + +def calc_eval_metrics( + aa_match_binary_list, orig_total_num_aa, pred_total_num_aa +): """ Calculate evaluation metrics using amino acid matches - + Parameters ---------- aa_match_binary_list : list of lists @@ -251,25 +296,32 @@ def calc_eval_metrics(aa_match_binary_list, orig_total_num_aa, pred_total_num_aa aa_precision: float Number of correct aa predictions divided by all predicted aa aa_recall: float - Number of correct aa predictions divided by all original aa + Number of correct aa predictions divided by all original aa pep_recall: float - Number of correct peptide predictions divided by all original peptide - """ - - correct_aa_count = sum([sum(pred_tuple[0]) for pred_tuple in aa_match_binary_list]) - aa_recall = correct_aa_count/(orig_total_num_aa+1e-8) - aa_precision = correct_aa_count/(pred_total_num_aa+1e-8) - pep_recall = sum([pred_tuple[1] for pred_tuple in aa_match_binary_list])/(len(aa_match_binary_list)+1e-8) - + Number of correct peptide predictions divided by all original peptide + """ + + correct_aa_count = sum( + [sum(pred_tuple[0]) for pred_tuple in aa_match_binary_list] + ) + aa_recall = correct_aa_count / (orig_total_num_aa + 1e-8) + aa_precision = correct_aa_count / (pred_total_num_aa + 1e-8) + pep_recall = sum( + [pred_tuple[1] for pred_tuple in aa_match_binary_list] + ) / (len(aa_match_binary_list) + 1e-8) + return aa_precision, aa_recall, pep_recall -def aa_precision_recall_with_threshold(correct_aa_confidences, all_aa_confidences, num_original_aa, threshold): + +def aa_precision_recall_with_threshold( + correct_aa_confidences, all_aa_confidences, num_original_aa, threshold +): """ Calculate precision and recall for the given amino acid confidence score threshold - + Parameters ---------- - correct_aa_confidences : list + correct_aa_confidences : list List of confidence scores for correct amino acids predictions all_aa_confidences : int List of confidence scores for all amino acids prediction @@ -277,19 +329,19 @@ def aa_precision_recall_with_threshold(correct_aa_confidences, all_aa_confidence Number of amino acids in the predicted peptide sequences threshold : float Amino acid confidence score threshold - + Returns ------- aa_precision: float Number of correct aa predictions divided by all predicted aa aa_recall: float - Number of correct aa predictions divided by all original aa - """ - - correct_aa = sum([conf>=threshold for conf in correct_aa_confidences]) - predicted_aa = sum([conf>=threshold for conf in all_aa_confidences]) - - aa_precision = correct_aa/predicted_aa - aa_recall = correct_aa/num_original_aa - - return aa_precision, aa_recall \ No newline at end of file + Number of correct aa predictions divided by all original aa + """ + + correct_aa = sum([conf >= threshold for conf in correct_aa_confidences]) + predicted_aa = sum([conf >= threshold for conf in all_aa_confidences]) + + aa_precision = correct_aa / predicted_aa + aa_recall = correct_aa / num_original_aa + + return aa_precision, aa_recall diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 2c214a49..ad95fdbf 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -60,7 +60,7 @@ class Spec2Pep(pl.LightningModule, ModelMixin): max_iters: int, optional Total number of iterations for learning rate scheduler output_path: str, optional - Path to write csv file with denovo peptide sequences + Path to write csv file with denovo peptide sequences **kwargs : Dict Keyword arguments passed to the Adam optimizer """ @@ -81,7 +81,7 @@ def __init__( tb_summarywriter=None, warmup_iters=100000, max_iters=600000, - output_path='', + output_path="", **kwargs, ): """Initialize a Spec2Pep model""" @@ -90,9 +90,9 @@ def __init__( # Writable self.max_length = max_length self.n_log = n_log - - self.residues = residues - + + self.residues = residues + # Build the model if custom_encoder is not None: if isinstance(custom_encoder, PairedSpectrumEncoder): @@ -126,12 +126,12 @@ def __init__( self._history = [] self.opt_kwargs = kwargs self.stop_token = self.decoder._aa2idx["$"] - + self.tb_summarywriter = tb_summarywriter - + self.warmup_iters = warmup_iters self.max_iters = max_iters - + # Store de novo sequences to be saved self.denovo_seqs = [] self.output_path = output_path @@ -292,8 +292,8 @@ def training_step(self, batch, *args): torch.Tensor The loss. """ - - spectra, precursors, sequences = batch + + spectra, precursors, sequences = batch pred, truth = self._step(spectra, precursors, sequences) pred = pred[:, :-1, :].reshape(-1, self.decoder.vocab_size + 1) @@ -336,55 +336,61 @@ def validation_step(self, batch, *args): on_epoch=True, sync_dist=True, ) - - #De novo sequence the batch + + # De novo sequence the batch pred_seqs, scores = self.predict_step(batch) - - #Temporary solution to predictions with multiple stop token '$', filter them out + + # Temporary solution to predictions with multiple stop token '$', filter them out filtered_pred_seqs = [] filtered_sequences = [] for i in range(len(pred_seqs)): if len(pred_seqs[i]) > 0: ps = pred_seqs[i] if pred_seqs[i][0] == "$": - ps = pred_seqs[i][1:] #Remove stop token - if "$" not in ps and len(ps)>0: + ps = pred_seqs[i][1:] # Remove stop token + if "$" not in ps and len(ps) > 0: filtered_pred_seqs += [ps] - filtered_sequences += [sequences[i]] - - #Find AA and peptide matches - all_aa_match, orig_total_num_aa, pred_total_num_aa = batch_aa_match(filtered_pred_seqs, filtered_sequences, self.decoder._peptide_mass.masses, 'best') - - #Calculate evaluation metrics based on matches - aa_precision, aa_recall, pep_recall = calc_eval_metrics(all_aa_match, orig_total_num_aa, pred_total_num_aa) - + filtered_sequences += [sequences[i]] + + # Find AA and peptide matches + all_aa_match, orig_total_num_aa, pred_total_num_aa = batch_aa_match( + filtered_pred_seqs, + filtered_sequences, + self.decoder._peptide_mass.masses, + "best", + ) + + # Calculate evaluation metrics based on matches + aa_precision, aa_recall, pep_recall = calc_eval_metrics( + all_aa_match, orig_total_num_aa, pred_total_num_aa + ) + self.log( "aa_precision", {"valid": aa_precision}, on_step=False, on_epoch=True, sync_dist=True, - ) - + ) + self.log( "aa_recall", {"valid": aa_recall}, on_step=False, on_epoch=True, sync_dist=True, - ) - + ) + self.log( "pep_recall", {"valid": pep_recall}, on_step=False, on_epoch=True, sync_dist=True, - ) - - + ) + return loss - + def test_step(self, batch, *args): """A single test step @@ -399,12 +405,11 @@ def test_step(self, batch, *args): (index 2) """ - #De novo sequence the batch + # De novo sequence the batch pred_seqs, scores = self.predict_step(batch) spectrum_order_id = batch[-1] - self.denovo_seqs += [(spectrum_order_id, pred_seqs,scores)] - - + self.denovo_seqs += [(spectrum_order_id, pred_seqs, scores)] + def on_train_epoch_end(self): """Log the training loss. @@ -412,7 +417,7 @@ def on_train_epoch_end(self): """ train_loss = self.trainer.callback_metrics["CELoss"]["train"].item() self._history[-1]["train"] = train_loss - + def on_validation_epoch_end(self): """Log the epoch metrics to self.history. @@ -421,57 +426,90 @@ def on_validation_epoch_end(self): metrics = { "epoch": self.trainer.current_epoch, "valid": self.trainer.callback_metrics["CELoss"]["valid"].item(), - "valid_aa_precision": self.trainer.callback_metrics["aa_precision"]["valid"].item(), - "valid_aa_recall": self.trainer.callback_metrics["aa_recall"]["valid"].item(), - "valid_pep_recall": self.trainer.callback_metrics["pep_recall"]["valid"].item(), + "valid_aa_precision": self.trainer.callback_metrics[ + "aa_precision" + ]["valid"].item(), + "valid_aa_recall": self.trainer.callback_metrics["aa_recall"][ + "valid" + ].item(), + "valid_pep_recall": self.trainer.callback_metrics["pep_recall"][ + "valid" + ].item(), } self._history.append(metrics) - + def on_test_epoch_end(self): """Write de novo sequences and confidence scores to csv file. This is a pytorch-lightning hook. """ - with open(os.path.join(str(self.output_path),'casanovo_output.csv'), 'w') as f: + with open( + os.path.join(str(self.output_path), "casanovo_output.csv"), "w" + ) as f: writer = csv.writer(f) - writer.writerow(['spectrum_id','denovo_seq','peptide_score','aa_scores']) - + writer.writerow( + ["spectrum_id", "denovo_seq", "peptide_score", "aa_scores"] + ) + for batch in self.denovo_seqs: - scores = batch[2].cpu() #transfer to cpu in case in gpu - + scores = batch[2].cpu() # transfer to cpu in case in gpu + for i in range(len(batch[0])): - top_scores = torch.max(scores[i],axis=1)[0] #take the score of most probable AA - empty_index = torch.where(top_scores==0.04)[0] #find the indices of positions after stop token - - if len(empty_index)>0:#check if decoding was stopped - last_index = empty_index[0]-1 #select index of the last AA - - if last_index >= 1: #check if peptide is at least one AA long - top_scores_list = top_scores[:last_index].tolist() #omit the stop token + top_scores = torch.max(scores[i], axis=1)[ + 0 + ] # take the score of most probable AA + empty_index = torch.where(top_scores == 0.04)[ + 0 + ] # find the indices of positions after stop token + + if len(empty_index) > 0: # check if decoding was stopped + last_index = ( + empty_index[0] - 1 + ) # select index of the last AA + + if ( + last_index >= 1 + ): # check if peptide is at least one AA long + top_scores_list = top_scores[ + :last_index + ].tolist() # omit the stop token peptide_score = np.mean(top_scores_list) aa_scores = list(reversed(top_scores_list)) - + else: peptide_score = None aa_scores = None - + else: peptide_score = None aa_scores = None - - writer.writerow([batch[0][i],batch[1][i][1:],peptide_score,aa_scores]) - + + writer.writerow( + [ + batch[0][i], + batch[1][i][1:], + peptide_score, + aa_scores, + ] + ) + def on_epoch_end(self): """Print log to console, if requested.""" - + if len(self._history) > 0: - #Print only if all output for the current epoch is recorded + # Print only if all output for the current epoch is recorded if len(self._history[-1]) == 6: if len(self._history) == 1: - LOGGER.info("---------------------------------------------------------------------------------------------------------") - LOGGER.info(" Epoch | Train Loss | Valid Loss | Valid AA precision | Valid AA recall | Valid Peptide recall ") - LOGGER.info("---------------------------------------------------------------------------------------------------------") - + LOGGER.info( + "---------------------------------------------------------------------------------------------------------" + ) + LOGGER.info( + " Epoch | Train Loss | Valid Loss | Valid AA precision | Valid AA recall | Valid Peptide recall " + ) + LOGGER.info( + "---------------------------------------------------------------------------------------------------------" + ) + metrics = self._history[-1] if not metrics["epoch"] % self.n_log: LOGGER.info( @@ -483,14 +521,34 @@ def on_epoch_end(self): metrics.get("valid_aa_recall", np.nan), metrics.get("valid_pep_recall", np.nan), ) - #Add metrics to SummaryWriter object if provided + # Add metrics to SummaryWriter object if provided if self.tb_summarywriter is not None: - self.tb_summarywriter.add_scalar('loss/train_crossentropy_loss', metrics.get("train", np.nan), metrics["epoch"]+1) - self.tb_summarywriter.add_scalar('loss/dev_crossentropy_loss', metrics.get("valid", np.nan), metrics["epoch"]+1) - - self.tb_summarywriter.add_scalar('eval/dev_aa_precision', metrics.get("valid_aa_precision", np.nan), metrics["epoch"]+1) - self.tb_summarywriter.add_scalar('eval/dev_aa_recall', metrics.get("valid_aa_recall", np.nan), metrics["epoch"]+1) - self.tb_summarywriter.add_scalar('eval/dev_pep_recall', metrics.get("valid_pep_recall", np.nan), metrics["epoch"]+1) + self.tb_summarywriter.add_scalar( + "loss/train_crossentropy_loss", + metrics.get("train", np.nan), + metrics["epoch"] + 1, + ) + self.tb_summarywriter.add_scalar( + "loss/dev_crossentropy_loss", + metrics.get("valid", np.nan), + metrics["epoch"] + 1, + ) + + self.tb_summarywriter.add_scalar( + "eval/dev_aa_precision", + metrics.get("valid_aa_precision", np.nan), + metrics["epoch"] + 1, + ) + self.tb_summarywriter.add_scalar( + "eval/dev_aa_recall", + metrics.get("valid_aa_recall", np.nan), + metrics["epoch"] + 1, + ) + self.tb_summarywriter.add_scalar( + "eval/dev_pep_recall", + metrics.get("valid_pep_recall", np.nan), + metrics["epoch"] + 1, + ) def configure_optimizers(self): """Initialize the optimizer. @@ -506,10 +564,13 @@ def configure_optimizers(self): optimizer = torch.optim.Adam(self.parameters(), **self.opt_kwargs) # Apply lr scheduler per step - lr_scheduler = CosineWarmupScheduler(optimizer, warmup=self.warmup_iters, max_iters=self.max_iters) - - return [optimizer], [{'scheduler': lr_scheduler, 'interval': 'step'}] - + lr_scheduler = CosineWarmupScheduler( + optimizer, warmup=self.warmup_iters, max_iters=self.max_iters + ) + + return [optimizer], [{"scheduler": lr_scheduler, "interval": "step"}] + + class CosineWarmupScheduler(torch.optim.lr_scheduler._LRScheduler): """Learning rate scheduler with linear warm up followed by cosine shaped decay. Parameters @@ -520,7 +581,7 @@ class CosineWarmupScheduler(torch.optim.lr_scheduler._LRScheduler): Number of warm up iterations max_iters : torch.optim Total number of iterations - """ + """ def __init__(self, optimizer, warmup, max_iters): self.warmup = warmup @@ -535,4 +596,4 @@ def get_lr_factor(self, epoch): lr_factor = 0.5 * (1 + np.cos(np.pi * epoch / self.max_num_iters)) if epoch <= self.warmup: lr_factor *= epoch * 1.0 / self.warmup - return lr_factor \ No newline at end of file + return lr_factor diff --git a/casanovo/denovo/train_test.py b/casanovo/denovo/train_test.py index 6ace2a58..a9deac96 100644 --- a/casanovo/denovo/train_test.py +++ b/casanovo/denovo/train_test.py @@ -7,240 +7,289 @@ from casanovo.denovo import DeNovoDataModule, Spec2Pep import yaml + def train(train_data_path, val_data_path, model_path, config): - """Train a Casanovo model with options specified in config.py.""" - - #Set random seed across PyTorch, numpy and python.random - pl.utilities.seed.seed_everything(seed=config['random_seed'], workers=True) + """Train a Casanovo model with options specified in config.py.""" + + # Set random seed across PyTorch, numpy and python.random + pl.utilities.seed.seed_everything(seed=config["random_seed"], workers=True) - #Index training and validation data + # Index training and validation data train_data_path = Path(train_data_path) if train_data_path.is_file(): - raise FileNotFoundError(f'train_data_path expects directory path but file path was provided instead') - train_mgf_files = [train_data_path / f for f in os.listdir(train_data_path) if (train_data_path/f).suffix.lower() == ".mgf"] + raise FileNotFoundError( + f"train_data_path expects directory path but file path was provided instead" + ) + train_mgf_files = [ + train_data_path / f + for f in os.listdir(train_data_path) + if (train_data_path / f).suffix.lower() == ".mgf" + ] val_data_path = Path(val_data_path) if val_data_path.is_file(): - raise FileNotFoundError(f'val_data_path expects directory path but file path was provided instead') - val_mgf_files = [val_data_path/f for f in os.listdir(val_data_path) if (val_data_path/f).suffix.lower() == ".mgf"] - - train_index = AnnotatedSpectrumIndex(os.path.join(os.getcwd(),config['train_annot_spec_idx_path']), train_mgf_files, overwrite=config['train_spec_idx_overwrite']) - val_index = AnnotatedSpectrumIndex(os.path.join(os.getcwd(),config['val_annot_spec_idx_path']), val_mgf_files, overwrite=config['val_spec_idx_overwrite']) + raise FileNotFoundError( + f"val_data_path expects directory path but file path was provided instead" + ) + val_mgf_files = [ + val_data_path / f + for f in os.listdir(val_data_path) + if (val_data_path / f).suffix.lower() == ".mgf" + ] - #Initialize data loaders + train_index = AnnotatedSpectrumIndex( + os.path.join(os.getcwd(), config["train_annot_spec_idx_path"]), + train_mgf_files, + overwrite=config["train_spec_idx_overwrite"], + ) + val_index = AnnotatedSpectrumIndex( + os.path.join(os.getcwd(), config["val_annot_spec_idx_path"]), + val_mgf_files, + overwrite=config["val_spec_idx_overwrite"], + ) + + # Initialize data loaders train_loader = DeNovoDataModule( train_index=train_index, - n_peaks=config['n_peaks'], - min_mz=config['min_mz'], - max_mz=config['max_mz'], - min_intensity=config['min_intensity'], - fragment_tol_mass=config['fragment_tol_mass'], - preprocess_spec=config['preprocess_spec'], - num_workers=config['num_workers'], - batch_size=config['train_batch_size'] + n_peaks=config["n_peaks"], + min_mz=config["min_mz"], + max_mz=config["max_mz"], + min_intensity=config["min_intensity"], + fragment_tol_mass=config["fragment_tol_mass"], + preprocess_spec=config["preprocess_spec"], + num_workers=config["num_workers"], + batch_size=config["train_batch_size"], ) val_loader = DeNovoDataModule( valid_index=val_index, - n_peaks=config['n_peaks'], - min_mz=config['min_mz'], - max_mz=config['max_mz'], - min_intensity=config['min_intensity'], - fragment_tol_mass=config['fragment_tol_mass'], - preprocess_spec=config['preprocess_spec'], - num_workers=config['num_workers'], - batch_size=config['val_batch_size'] + n_peaks=config["n_peaks"], + min_mz=config["min_mz"], + max_mz=config["max_mz"], + min_intensity=config["min_intensity"], + fragment_tol_mass=config["fragment_tol_mass"], + preprocess_spec=config["preprocess_spec"], + num_workers=config["num_workers"], + batch_size=config["val_batch_size"], ) train_loader.setup() val_loader.setup() # Initialize the model - if config['train_from_scratch'] == True: + if config["train_from_scratch"] == True: model = Spec2Pep( - dim_model=config['dim_model'], - n_head=config['n_head'], - dim_feedforward=config['dim_feedforward'], - n_layers=config['n_layers'], - dropout=config['dropout'], - dim_intensity=config['dim_intensity'], - custom_encoder=config['custom_encoder'], - max_length=config['max_length'], - residues=config['residues'], - max_charge=config['max_charge'], - n_log=config['n_log'], - tb_summarywriter=config['tb_summarywriter'], - warmup_iters = config['warmup_iters'], - max_iters = config['max_iters'], - lr=config['learning_rate'], - weight_decay=config['weight_decay'] - ) + dim_model=config["dim_model"], + n_head=config["n_head"], + dim_feedforward=config["dim_feedforward"], + n_layers=config["n_layers"], + dropout=config["dropout"], + dim_intensity=config["dim_intensity"], + custom_encoder=config["custom_encoder"], + max_length=config["max_length"], + residues=config["residues"], + max_charge=config["max_charge"], + n_log=config["n_log"], + tb_summarywriter=config["tb_summarywriter"], + warmup_iters=config["warmup_iters"], + max_iters=config["max_iters"], + lr=config["learning_rate"], + weight_decay=config["weight_decay"], + ) else: model_path = Path(model_path) if model_path.is_dir(): - raise FileNotFoundError(f'model_path expects file path but directory path was provided instead') + raise FileNotFoundError( + f"model_path expects file path but directory path was provided instead" + ) model = Spec2Pep().load_from_checkpoint( - model_path, - dim_model=config['dim_model'], - n_head=config['n_head'], - dim_feedforward=config['dim_feedforward'], - n_layers=config['n_layers'], - dropout=config['dropout'], - dim_intensity=config['dim_intensity'], - custom_encoder=config['custom_encoder'], - max_length=config['max_length'], - residues=config['residues'], - max_charge=config['max_charge'], - n_log=config['n_log'], - tb_summarywriter=config['tb_summarywriter'], - warmup_iters = config['warmup_iters'], - max_iters = config['max_iters'], - lr=config['learning_rate'], - weight_decay=config['weight_decay'] - ) - - + model_path, + dim_model=config["dim_model"], + n_head=config["n_head"], + dim_feedforward=config["dim_feedforward"], + n_layers=config["n_layers"], + dropout=config["dropout"], + dim_intensity=config["dim_intensity"], + custom_encoder=config["custom_encoder"], + max_length=config["max_length"], + residues=config["residues"], + max_charge=config["max_charge"], + n_log=config["n_log"], + tb_summarywriter=config["tb_summarywriter"], + warmup_iters=config["warmup_iters"], + max_iters=config["max_iters"], + lr=config["learning_rate"], + weight_decay=config["weight_decay"], + ) - #Create Trainer object and checkpoint callback to save model - if config['save_model'] == True: + # Create Trainer object and checkpoint callback to save model + if config["save_model"] == True: checkpoint_callback = pl.callbacks.ModelCheckpoint( - dirpath=config['model_save_folder_path'], - save_weights_only=config['save_weights_only'], - filename='{epoch}', - every_n_epochs=config['every_n_epochs'], - save_top_k=-1 + dirpath=config["model_save_folder_path"], + save_weights_only=config["save_weights_only"], + filename="{epoch}", + every_n_epochs=config["every_n_epochs"], + save_top_k=-1, ) trainer = pl.Trainer( - accelerator=config['accelerator'], - logger=config['logger'], - gpus=config['gpus'], - max_epochs=config['max_epochs'], - num_sanity_val_steps=config['num_sanity_val_steps'], - callbacks=[checkpoint_callback] - ) + accelerator=config["accelerator"], + logger=config["logger"], + gpus=config["gpus"], + max_epochs=config["max_epochs"], + num_sanity_val_steps=config["num_sanity_val_steps"], + callbacks=[checkpoint_callback], + ) else: trainer = pl.Trainer( - accelerator=config['accelerator'], - logger=config['logger'], - gpus=config['gpus'], - max_epochs=config['max_epochs'], - num_sanity_val_steps=config['num_sanity_val_steps'] + accelerator=config["accelerator"], + logger=config["logger"], + gpus=config["gpus"], + max_epochs=config["max_epochs"], + num_sanity_val_steps=config["num_sanity_val_steps"], ) - - #Train the model - trainer.fit(model, train_loader.train_dataloader(), val_loader.val_dataloader()) + + # Train the model + trainer.fit( + model, train_loader.train_dataloader(), val_loader.val_dataloader() + ) + def evaluate(test_data_path, model_path, config): """Run inference a pre-trained Casanovo model with evaluation and using options specified in config.py.""" - + # Initialize the pre-trained model model_path = Path(model_path) if model_path.is_dir(): - raise FileNotFoundError(f'model_path expects file path but directory path was provided instead') + raise FileNotFoundError( + f"model_path expects file path but directory path was provided instead" + ) model_trained = Spec2Pep().load_from_checkpoint( - model_path, - dim_model=config['dim_model'], - n_head=config['n_head'], - dim_feedforward=config['dim_feedforward'], - n_layers=config['n_layers'], - dropout=config['dropout'], - dim_intensity=config['dim_intensity'], - custom_encoder=config['custom_encoder'], - max_length=config['max_length'], - residues=config['residues'], - max_charge=config['max_charge'], - n_log=config['n_log'], -) - #Index test data + model_path, + dim_model=config["dim_model"], + n_head=config["n_head"], + dim_feedforward=config["dim_feedforward"], + n_layers=config["n_layers"], + dropout=config["dropout"], + dim_intensity=config["dim_intensity"], + custom_encoder=config["custom_encoder"], + max_length=config["max_length"], + residues=config["residues"], + max_charge=config["max_charge"], + n_log=config["n_log"], + ) + # Index test data test_data_path = Path(test_data_path) if test_data_path.is_file(): - raise FileNotFoundError(f'test_data_path expects directory path but file path was provided instead') - mgf_files = [test_data_path/f for f in os.listdir(test_data_path) if (test_data_path/f).suffix.lower() == ".mgf"] - index = AnnotatedSpectrumIndex(os.path.join(os.getcwd(),config['test_annot_spec_idx_path']), mgf_files, overwrite=config['test_spec_idx_overwrite']) - - #Initialize the data loader - loaders=DeNovoDataModule( + raise FileNotFoundError( + f"test_data_path expects directory path but file path was provided instead" + ) + mgf_files = [ + test_data_path / f + for f in os.listdir(test_data_path) + if (test_data_path / f).suffix.lower() == ".mgf" + ] + index = AnnotatedSpectrumIndex( + os.path.join(os.getcwd(), config["test_annot_spec_idx_path"]), + mgf_files, + overwrite=config["test_spec_idx_overwrite"], + ) + + # Initialize the data loader + loaders = DeNovoDataModule( test_index=index, - n_peaks=config['n_peaks'], - min_mz=config['min_mz'], - max_mz=config['max_mz'], - min_intensity=config['min_intensity'], - fragment_tol_mass=config['fragment_tol_mass'], - preprocess_spec=config['preprocess_spec'], - num_workers=config['num_workers'], - batch_size=config['test_batch_size'] + n_peaks=config["n_peaks"], + min_mz=config["min_mz"], + max_mz=config["max_mz"], + min_intensity=config["min_intensity"], + fragment_tol_mass=config["fragment_tol_mass"], + preprocess_spec=config["preprocess_spec"], + num_workers=config["num_workers"], + batch_size=config["test_batch_size"], ) - - loaders.setup(stage='test', annotated=True) - #Create Trainer object + loaders.setup(stage="test", annotated=True) + + # Create Trainer object trainer = pl.Trainer( - accelerator=config['accelerator'], - logger=config['logger'], - gpus=config['gpus'], - max_epochs=config['max_epochs'], - num_sanity_val_steps=config['num_sanity_val_steps'] + accelerator=config["accelerator"], + logger=config["logger"], + gpus=config["gpus"], + max_epochs=config["max_epochs"], + num_sanity_val_steps=config["num_sanity_val_steps"], ) - - #Run test + + # Run test trainer.validate(model_trained, loaders.test_dataloader()) - + + def denovo(test_data_path, model_path, config, output_path): """Run inference with a pre-trained Casanovo model without evaluation and using options specified in config.py.""" # Initialize the pre-trained model model_path = Path(model_path) if model_path.is_dir(): - raise FileNotFoundError(f'model_path expects file path but directory path was provided instead') + raise FileNotFoundError( + f"model_path expects file path but directory path was provided instead" + ) model_trained = Spec2Pep().load_from_checkpoint( - model_path, - dim_model=config['dim_model'], - n_head=config['n_head'], - dim_feedforward=config['dim_feedforward'], - n_layers=config['n_layers'], - dropout=config['dropout'], - dim_intensity=config['dim_intensity'], - custom_encoder=config['custom_encoder'], - max_length=config['max_length'], - residues=config['residues'], - max_charge=config['max_charge'], - n_log=config['n_log'], - output_path=output_path -) - #Index test data + model_path, + dim_model=config["dim_model"], + n_head=config["n_head"], + dim_feedforward=config["dim_feedforward"], + n_layers=config["n_layers"], + dropout=config["dropout"], + dim_intensity=config["dim_intensity"], + custom_encoder=config["custom_encoder"], + max_length=config["max_length"], + residues=config["residues"], + max_charge=config["max_charge"], + n_log=config["n_log"], + output_path=output_path, + ) + # Index test data test_data_path = Path(test_data_path) if test_data_path.is_file(): - raise FileNotFoundError(f'test_data_path expects directory path but file path was provided instead') - mgf_files = [test_data_path/f for f in os.listdir(test_data_path) if (test_data_path/f).suffix.lower() == ".mgf"] - index = SpectrumIndex(os.path.join(os.getcwd(),config['test_annot_spec_idx_path']), mgf_files, overwrite=config['test_spec_idx_overwrite']) + raise FileNotFoundError( + f"test_data_path expects directory path but file path was provided instead" + ) + mgf_files = [ + test_data_path / f + for f in os.listdir(test_data_path) + if (test_data_path / f).suffix.lower() == ".mgf" + ] + index = SpectrumIndex( + os.path.join(os.getcwd(), config["test_annot_spec_idx_path"]), + mgf_files, + overwrite=config["test_spec_idx_overwrite"], + ) - #Initialize the data loader - loaders=DeNovoDataModule( + # Initialize the data loader + loaders = DeNovoDataModule( test_index=index, - n_peaks=config['n_peaks'], - min_mz=config['min_mz'], - max_mz=config['max_mz'], - min_intensity=config['min_intensity'], - fragment_tol_mass=config['fragment_tol_mass'], - preprocess_spec=config['preprocess_spec'], - num_workers=config['num_workers'], - batch_size=config['test_batch_size'] + n_peaks=config["n_peaks"], + min_mz=config["min_mz"], + max_mz=config["max_mz"], + min_intensity=config["min_intensity"], + fragment_tol_mass=config["fragment_tol_mass"], + preprocess_spec=config["preprocess_spec"], + num_workers=config["num_workers"], + batch_size=config["test_batch_size"], ) - - loaders.setup(stage='test', annotated=False) - #Create Trainer object + loaders.setup(stage="test", annotated=False) + + # Create Trainer object trainer = pl.Trainer( - accelerator=config['accelerator'], - logger=config['logger'], - gpus=config['gpus'], - max_epochs=config['max_epochs'], - num_sanity_val_steps=config['num_sanity_val_steps'] + accelerator=config["accelerator"], + logger=config["logger"], + gpus=config["gpus"], + max_epochs=config["max_epochs"], + num_sanity_val_steps=config["num_sanity_val_steps"], ) - - #Run model without evaluation + + # Run model without evaluation trainer.test(model_trained, loaders.test_dataloader()) diff --git a/tests/conftest.py b/tests/conftest.py index 2145e977..7fc1ac33 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -20,6 +20,7 @@ # Pytest fixtures added here can be used as parameters for testing methods + @pytest.fixture def mgf_small(tmp_path): """An MGF file with 2 annotated spectra.""" @@ -27,7 +28,8 @@ def mgf_small(tmp_path): mgf_file = tmp_path / "small.mgf" return _create_mgf(peptides, mgf_file) -#Utility Funcs + +# Utility Funcs def _create_mgf(peptides, mgf_file, random_state=42): """Create a fake MGF file from one or more peptides. Parameters @@ -49,6 +51,7 @@ def _create_mgf(peptides, mgf_file, random_state=42): return mgf_file + def _create_mgf_entry(peptide, charge=2): """Create a MassIVE-KB style MGF entry for a single PSM. Parameters @@ -88,6 +91,7 @@ def _create_mgf_entry(peptide, charge=2): ] return "\n".join(mgf) + # Set the PPX_DATA_DIRECTORY -------------------------------------------------- @pytest.fixture(autouse=True) def ppx_data_dir(monkeypatch, tmp_path): diff --git a/tests/system_tests/test_program.py b/tests/system_tests/test_program.py index 139a3564..937f30c6 100644 --- a/tests/system_tests/test_program.py +++ b/tests/system_tests/test_program.py @@ -6,69 +6,91 @@ import pytorch_lightning as pl from os.path import isfile + def test_denovo_casanovo(mgf_small): """Load up a basic config""" - abs_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../casanovo/config.yaml') + abs_path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "../../casanovo/config.yaml", + ) with open(abs_path) as f: config = yaml.safe_load(f) """Ensure that the test doesn't raise any system overdraw issues in terms of resources""" - config['gpus'] = [] - config['num_workers'] = 0 - output_path=os.path.dirname(os.path.abspath(__file__)) + config["gpus"] = [] + config["num_workers"] = 0 + output_path = os.path.dirname(os.path.abspath(__file__)) """Running a small mgf through an untrained model just to test that the model output is valid in terms of format""" tabula_rasa_model = Spec2Pep( - dim_model=config['dim_model'], - n_head=config['n_head'], - dim_feedforward=config['dim_feedforward'], - n_layers=config['n_layers'], - dropout=config['dropout'], - dim_intensity=config['dim_intensity'], - custom_encoder=config['custom_encoder'], - max_length=config['max_length'], - residues=config['residues'], - max_charge=config['max_charge'], - n_log=config['n_log'], - tb_summarywriter=config['tb_summarywriter'], - warmup_iters = config['warmup_iters'], - max_iters = config['max_iters'], - lr=config['learning_rate'], - weight_decay=config['weight_decay'], - output_path=output_path - ) + dim_model=config["dim_model"], + n_head=config["n_head"], + dim_feedforward=config["dim_feedforward"], + n_layers=config["n_layers"], + dropout=config["dropout"], + dim_intensity=config["dim_intensity"], + custom_encoder=config["custom_encoder"], + max_length=config["max_length"], + residues=config["residues"], + max_charge=config["max_charge"], + n_log=config["n_log"], + tb_summarywriter=config["tb_summarywriter"], + warmup_iters=config["warmup_iters"], + max_iters=config["max_iters"], + lr=config["learning_rate"], + weight_decay=config["weight_decay"], + output_path=output_path, + ) from pathlib import Path + mgf_small = Path(mgf_small).parents[0] - mgf_files = [mgf_small/f for f in os.listdir(mgf_small) if (mgf_small/f).suffix.lower() == ".mgf"] - index = SpectrumIndex(os.path.join(os.getcwd(),config['test_annot_spec_idx_path']), mgf_files, overwrite=config['test_spec_idx_overwrite']) + mgf_files = [ + mgf_small / f + for f in os.listdir(mgf_small) + if (mgf_small / f).suffix.lower() == ".mgf" + ] + index = SpectrumIndex( + os.path.join(os.getcwd(), config["test_annot_spec_idx_path"]), + mgf_files, + overwrite=config["test_spec_idx_overwrite"], + ) - loaders=DeNovoDataModule( + loaders = DeNovoDataModule( test_index=index, - n_peaks=config['n_peaks'], - min_mz=config['min_mz'], - max_mz=config['max_mz'], - min_intensity=config['min_intensity'], - fragment_tol_mass=config['fragment_tol_mass'], - preprocess_spec=config['preprocess_spec'], - num_workers=config['num_workers'], - batch_size=config['test_batch_size'] + n_peaks=config["n_peaks"], + min_mz=config["min_mz"], + max_mz=config["max_mz"], + min_intensity=config["min_intensity"], + fragment_tol_mass=config["fragment_tol_mass"], + preprocess_spec=config["preprocess_spec"], + num_workers=config["num_workers"], + batch_size=config["test_batch_size"], ) - - loaders.setup(stage='test', annotated=False) - #Create Trainer object + loaders.setup(stage="test", annotated=False) + + # Create Trainer object trainer = pl.Trainer( - accelerator=config['accelerator'], - logger=config['logger'], - gpus=config['gpus'], - max_epochs=config['max_epochs'], - num_sanity_val_steps=config['num_sanity_val_steps'] + accelerator=config["accelerator"], + logger=config["logger"], + gpus=config["gpus"], + max_epochs=config["max_epochs"], + num_sanity_val_steps=config["num_sanity_val_steps"], ) trainer.test(tabula_rasa_model, loaders.test_dataloader()) """If the output exists then the test has passed""" - if(isfile(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'casanovo_output.csv'))): - os.remove(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'casanovo_output.csv')) - assert True \ No newline at end of file + if isfile( + os.path.join( + os.path.dirname(os.path.abspath(__file__)), "casanovo_output.csv" + ) + ): + os.remove( + os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "casanovo_output.csv", + ) + ) + assert True diff --git a/tests/unit_tests/test_transformer.py b/tests/unit_tests/test_transformer.py index 9007a281..a158826f 100644 --- a/tests/unit_tests/test_transformer.py +++ b/tests/unit_tests/test_transformer.py @@ -7,6 +7,7 @@ ) from depthcharge.masses import PeptideMass + def test_spectrum_encoder(): spectra = torch.tensor( [ @@ -15,24 +16,24 @@ def test_spectrum_encoder(): ] ) model = SpectrumEncoder( - dim_model=128, - n_head=8, + dim_model=128, + n_head=8, dim_feedforward=1024, - n_layers=1, - dropout=0, - dim_intensity=None + n_layers=1, + dropout=0, + dim_intensity=None, ) emb, mask = model(spectra) assert emb.shape == (2, 4, 128) assert mask.sum() == 1 model = SpectrumEncoder( - dim_model=128, - n_head=8, + dim_model=128, + n_head=8, dim_feedforward=1024, - n_layers=1, - dropout=0, - dim_intensity=4 + n_layers=1, + dropout=0, + dim_intensity=4, ) emb, mask = model(spectra) assert emb.shape == (2, 4, 128) @@ -48,6 +49,7 @@ def test_spectrum_encoder(): assert emb.shape == (2, 4, 8) assert mask.sum() == 1 + def test_peptide_decoder(): spectra = torch.tensor( [ @@ -67,14 +69,14 @@ def test_peptide_decoder(): max_charge=5, ) enc_model = SpectrumEncoder( - dim_model=128, - n_head=8, + dim_model=128, + n_head=8, dim_feedforward=1024, - n_layers=1, - dropout=0, - dim_intensity=None + n_layers=1, + dropout=0, + dim_intensity=None, ) memory, mem_mask = enc_model(spectra) scores, tokens = dec_model(peptides, precursors, memory, mem_mask) assert scores.shape == (2, 10, dec_model.vocab_size + 1) - assert tokens.shape == (2, 9) \ No newline at end of file + assert tokens.shape == (2, 9)