From a64ac4098414eba5a8ff5ae92997bf1b4c27516e Mon Sep 17 00:00:00 2001 From: Michael Lazear Date: Tue, 15 Aug 2023 15:57:40 -0700 Subject: [PATCH] Update docs, fix LFQ-parquet, release v0.14.0 --- CHANGELOG.md | 6 +- Cargo.toml | 4 +- DOCS.md | 4 +- README.md | 18 +-- crates/sage-cloudpath/src/parquet.rs | 201 ++++++++++++++------------- 5 files changed, 122 insertions(+), 111 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 098aa7e..e5d7548 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,11 +4,11 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] +## [v0.14.0] ### Added -- Support for parquet file format output. Search results and reporter ion quantification will be written to one file (`results.sage.parquet`) and label-free quant will be written to another (`lfq.parquet`) +- Support for parquet file format output. Search results and reporter ion quantification will be written to one file (`results.sage.parquet`) and label-free quant will be written to another (`lfq.parquet`). Parquet files tend to be significantly smaller than TSV files, faster to parse, and are compatible with a variety of distributed SQL engines. ### Changed -- Implement heapselect algorithm for faster sorting of candidate matches (#80) +- Implement heapselect algorithm for faster sorting of candidate matches (#80). This is a backwards-incompatible change with respect to output - small changes in PSM ranks will be present between v0.13.4 and v0.14.0 ## [v0.13.4] ### Fixed diff --git a/Cargo.toml b/Cargo.toml index bec6638..3e01cec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,6 @@ members = [ ] [profile.release] -#lto = "fat" -#codegen-units = 1 +lto = "fat" +codegen-units = 1 panic = "abort" \ No newline at end of file diff --git a/DOCS.md b/DOCS.md index 9ba326d..1a39b2b 100644 --- a/DOCS.md +++ b/DOCS.md @@ -240,7 +240,7 @@ The "results.sage.tsv" file contains the following columns (headers): - `spectrum_q`: Assigned spectrum-level q-value. - `peptide_q`: Assigned peptide-level q-value. - `protein_q`: Assigned protein-level q-value. -- `ms1_intensity`: Intensity of the MS1 precursor ion +- `ms1_intensity`: Intensity of the selected MS1 precursor ion (not label-free quant) - `ms2_intensity`: Total intensity of MS2 spectrum -These columns provide comprehensive information about each candidate peptide spectrum match (PSM) identified by the Sage search engine, enabling users to assess the quality and characteristics of the results. \ No newline at end of file +These columns provide comprehensive information about each candidate peptide spectrum match (PSM) identified by the Sage search engine. \ No newline at end of file diff --git a/README.md b/README.md index c63abde..f1057ba 100644 --- a/README.md +++ b/README.md @@ -22,8 +22,10 @@ Check out the [blog post introducing Sage](https://lazear.github.io/sage/) for m - Incredible performance out of the box - Effortlessly cross-platform (Linux/MacOS/Windows), effortlessly parallel (uses all of your CPU cores) - Fragment indexing strategy allows for blazing fast narrow and open searches (> 500 Da precursor tolerance) -- MS3-TMT quantification (R-squared of 0.999 with Proteome Discoverer) +- Isobaric quantification (MS2/MS3-TMT, or custom reporter ions) +- Label-free quantification: consider all charge states & isotopologues *a la* FlashLFQ - Capable of searching for chimeric/co-fragmenting spectra +- Wide-window (dynamic precursor tolerance) search mode - enables WWA/PRM/DIA searches - Retention time prediction models fit to each LC/MS run - PSM rescoring using built-in linear discriminant analysis (LDA) - PEP calculation using a non-parametric model (KDE) @@ -33,15 +35,11 @@ Check out the [blog post introducing Sage](https://lazear.github.io/sage/) for m - Built-in support for reading gzipped-mzML files - Support for reading/writing directly from AWS S3 -### Experimental features - -- Label-free quantification: consider all charge states & isotopologues *a la* FlashLFQ - ### Assign multiple peptides to complex spectra -- When chimeric searching is turned on, 2 peptide identifications will be reported for each MS2 scan, both with `rank=1` +- When chimeric searching is enabled, multiple peptide identifications can be reported for each MS2 scan ### Sage trains machine learning models for FDR refinement and posterior error probability calculation @@ -113,6 +111,8 @@ Options: Path where search and quant results will be written. Overrides the directory specified in the configuration file. --batch-size Number of files to search in parallel (default = number of CPUs/2) + --parquet + Write parquet files instead of tab-separated files --write-pin Write percolator-compatible `.pin` output files -h, --help @@ -127,7 +127,7 @@ Example usage: `sage config.json` Some options in the parameters file can be over-written using the command line interface. These are: -1. The paths to the raw mzML data +1. The paths to the mzML data 2. The path to the database (fasta file) 3. The output directory @@ -149,12 +149,14 @@ Running Sage will produce several output files (located in either the current di - MS2 search results will be stored as a tab-separated file (`results.sage.tsv`) file - this is a tab-separated file, which can be opened in Excel/Pandas/etc - MS2 and MS3 quantitation results will be stored as a tab-separated file (`tmt.tsv`, `lfq.tsv`) if `quant.tmt` or `quant.lfq` options are used in the parameter file +If `--parquet` is passed as a command line argument, `results.sage.parquet` (and optionally, `lfq.parquet`) will be written. These have a similar set of columns, but TMT values are stored as a nested array alongside PSM features + ## Configuration file schema ### Notes - The majority of parameters are optional - only "database.fasta", "precursor_tol", and "fragment_tol" are required. Sage will try and use reasonable defaults for any parameters not supplied -- Tolerances are specified on the *experimental* m/z values. To perform a -100 to +500 Da open search (mass window applied to *precursor*), you would use `"da": [-500, 100]` +- Tolerances are specified on the *experimental* m/z values. To perform a -100 to +500 Da open search (mass window applied to *theoretical*), you would use `"da": [-500, 100]` ### Decoys diff --git a/crates/sage-cloudpath/src/parquet.rs b/crates/sage-cloudpath/src/parquet.rs index 6421bea..9deb294 100644 --- a/crates/sage-cloudpath/src/parquet.rs +++ b/crates/sage-cloudpath/src/parquet.rs @@ -34,7 +34,7 @@ pub fn build_schema() -> Result { required byte_array proteins (utf8); required int32 num_proteins; required int32 rank; - required int32 label; + required boolean is_decoy; required float expmass; required float calcmass; required int32 charge; @@ -131,97 +131,99 @@ pub fn serialize_features( let buf = Vec::new(); let mut writer = SerializedFileWriter::new(buf, schema.into(), options.into())?; - let mut rg = writer.next_row_group()?; - macro_rules! write_col { - ($field:ident, $ty:ident) => { - if let Some(mut col) = rg.next_column()? { - col.typed::<$ty>().write_batch( - &features - .iter() - .map(|f| f.$field as <$ty as DataType>::T) - .collect::>(), - None, - None, - )?; - col.close()?; - } - }; - ($lambda:expr, $ty:ident) => { - if let Some(mut col) = rg.next_column()? { - col.typed::<$ty>().write_batch( - &features.iter().map($lambda).collect::>(), - None, - None, - )?; - col.close()?; - } - }; - } + for features in features.chunks(65536) { + let mut rg = writer.next_row_group()?; + macro_rules! write_col { + ($field:ident, $ty:ident) => { + if let Some(mut col) = rg.next_column()? { + col.typed::<$ty>().write_batch( + &features + .iter() + .map(|f| f.$field as <$ty as DataType>::T) + .collect::>(), + None, + None, + )?; + col.close()?; + } + }; + ($lambda:expr, $ty:ident) => { + if let Some(mut col) = rg.next_column()? { + col.typed::<$ty>().write_batch( + &features.iter().map($lambda).collect::>(), + None, + None, + )?; + col.close()?; + } + }; + } - write_col!( - |f: &Feature| filenames[f.file_id].as_str().into(), - ByteArrayType - ); - write_col!(|f: &Feature| f.spec_id.as_str().into(), ByteArrayType); - write_col!( - |f: &Feature| database[f.peptide_idx].to_string().as_bytes().into(), - ByteArrayType - ); - write_col!( - |f: &Feature| database[f.peptide_idx].sequence.as_ref().into(), - ByteArrayType - ); - write_col!( - |f: &Feature| database[f.peptide_idx] - .proteins(&database.decoy_tag, database.generate_decoys) - .as_str() - .into(), - ByteArrayType - ); - write_col!( - |f: &Feature| database[f.peptide_idx].proteins.len() as i32, - Int32Type - ); - write_col!(rank, Int32Type); - write_col!(label, Int32Type); - write_col!(expmass, FloatType); - write_col!(calcmass, FloatType); - write_col!(charge, Int32Type); - write_col!(peptide_len, Int32Type); - write_col!(missed_cleavages, Int32Type); - write_col!(isotope_error, FloatType); - write_col!(delta_mass, FloatType); - write_col!(average_ppm, FloatType); - write_col!(hyperscore, FloatType); - write_col!(delta_next, FloatType); - write_col!(delta_best, FloatType); - write_col!(rt, FloatType); - write_col!(aligned_rt, FloatType); - write_col!(predicted_rt, FloatType); - write_col!(delta_rt_model, FloatType); - write_col!(matched_peaks, Int32Type); - write_col!(longest_b, Int32Type); - write_col!(longest_y, Int32Type); - write_col!(longest_y_pct, FloatType); - write_col!(matched_intensity_pct, FloatType); - write_col!(scored_candidates, Int32Type); - write_col!(poisson, FloatType); - write_col!(discriminant_score, FloatType); - write_col!(posterior_error, FloatType); - write_col!(spectrum_q, FloatType); - write_col!(peptide_q, FloatType); - write_col!(protein_q, FloatType); - - if let Some(col) = rg.next_column()? { - if reporter_ions.is_empty() { - write_null_column(col, features.len())?; - } else { - write_reporter_ions(col, features, reporter_ions)?; + write_col!( + |f: &Feature| filenames[f.file_id].as_str().into(), + ByteArrayType + ); + write_col!(|f: &Feature| f.spec_id.as_str().into(), ByteArrayType); + write_col!( + |f: &Feature| database[f.peptide_idx].to_string().as_bytes().into(), + ByteArrayType + ); + write_col!( + |f: &Feature| database[f.peptide_idx].sequence.as_ref().into(), + ByteArrayType + ); + write_col!( + |f: &Feature| database[f.peptide_idx] + .proteins(&database.decoy_tag, database.generate_decoys) + .as_str() + .into(), + ByteArrayType + ); + write_col!( + |f: &Feature| database[f.peptide_idx].proteins.len() as i32, + Int32Type + ); + write_col!(rank, Int32Type); + write_col!(|f: &Feature| f.label == -1, BoolType); + write_col!(expmass, FloatType); + write_col!(calcmass, FloatType); + write_col!(charge, Int32Type); + write_col!(peptide_len, Int32Type); + write_col!(missed_cleavages, Int32Type); + write_col!(isotope_error, FloatType); + write_col!(delta_mass, FloatType); + write_col!(average_ppm, FloatType); + write_col!(hyperscore, FloatType); + write_col!(delta_next, FloatType); + write_col!(delta_best, FloatType); + write_col!(rt, FloatType); + write_col!(aligned_rt, FloatType); + write_col!(predicted_rt, FloatType); + write_col!(delta_rt_model, FloatType); + write_col!(matched_peaks, Int32Type); + write_col!(longest_b, Int32Type); + write_col!(longest_y, Int32Type); + write_col!(longest_y_pct, FloatType); + write_col!(matched_intensity_pct, FloatType); + write_col!(scored_candidates, Int32Type); + write_col!(poisson, FloatType); + write_col!(discriminant_score, FloatType); + write_col!(posterior_error, FloatType); + write_col!(spectrum_q, FloatType); + write_col!(peptide_q, FloatType); + write_col!(protein_q, FloatType); + + if let Some(col) = rg.next_column()? { + if reporter_ions.is_empty() { + write_null_column(col, features.len())?; + } else { + write_reporter_ions(col, features, reporter_ions)?; + } } - } - rg.close()?; + rg.close()?; + } writer.into_inner() } @@ -231,7 +233,7 @@ pub fn build_lfq_schema() -> parquet::errors::Result { required byte_array peptide (utf8); required byte_array stripped_peptide (utf8); required byte_array proteins (utf8); - required boolean decoy; + required boolean is_decoy; required float q_value; required byte_array filename (utf8); required float intensity; @@ -258,7 +260,10 @@ pub fn serialize_lfq( if let Some(mut col) = rg.next_column()? { let values = areas .iter() - .map(|((peptide_idx, _), _)| database[*peptide_idx].to_string().as_bytes().into()) + .flat_map(|((peptide_idx, _), _)| { + let val = database[*peptide_idx].to_string().as_bytes().into(); + std::iter::repeat(val).take(filenames.len()) + }) .collect::>(); col.typed::() @@ -269,7 +274,10 @@ pub fn serialize_lfq( if let Some(mut col) = rg.next_column()? { let values = areas .iter() - .map(|((peptide_idx, _), _)| database[*peptide_idx].sequence.as_ref().into()) + .flat_map(|((peptide_idx, _), _)| { + let val = database[*peptide_idx].sequence.as_ref().into(); + std::iter::repeat(val).take(filenames.len()) + }) .collect::>(); col.typed::() @@ -280,11 +288,12 @@ pub fn serialize_lfq( if let Some(mut col) = rg.next_column()? { let values = areas .iter() - .map(|((peptide_idx, _), _)| { - database[*peptide_idx] + .flat_map(|((peptide_idx, _), _)| { + let val = database[*peptide_idx] .proteins(&database.decoy_tag, database.generate_decoys) .as_str() - .into() + .into(); + std::iter::repeat(val).take(filenames.len()) }) .collect::>(); @@ -296,7 +305,7 @@ pub fn serialize_lfq( if let Some(mut col) = rg.next_column()? { let values = areas .iter() - .map(|((_, decoy), _)| *decoy) + .flat_map(|((_, decoy), _)| std::iter::repeat(*decoy).take(filenames.len())) .collect::>(); col.typed::().write_batch(&values, None, None)?; @@ -306,7 +315,7 @@ pub fn serialize_lfq( if let Some(mut col) = rg.next_column()? { let values = areas .iter() - .map(|((_, _), (peak, _))| peak.q_value) + .flat_map(|((_, _), (peak, _))| std::iter::repeat(peak.q_value).take(filenames.len())) .collect::>(); col.typed::().write_batch(&values, None, None)?;