diff --git a/.gitignore b/.gitignore index dd27331..76a75fb 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,5 @@ interest.txt latest.gmt ref.txt target_symbols.txt +/*.gmt +/*.rnk diff --git a/README.md b/README.md index 8eec440..613612e 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![Rust](https://github.com/bzhanglab/webgestalt_rust/actions/workflows/rust.yml/badge.svg?branch=master)](https://github.com/bzhanglab/webgestalt_rust/actions/workflows/rust.yml) -Rust implementation of [WebGestaltR](https://github.com/bzhanglab/webgestaltr). +Rust implementation of [WebGestaltR](https://github.com/bzhanglab/webgestaltr). ## Install diff --git a/src/main.rs b/src/main.rs index 6fb133a..febb099 100644 --- a/src/main.rs +++ b/src/main.rs @@ -5,8 +5,11 @@ use owo_colors::{OwoColorize, Stream::Stdout, Style}; use std::io::{BufReader, Write}; use std::{fs::File, time::Instant}; use webgestalt_lib::methods::gsea::GSEAConfig; +use webgestalt_lib::methods::multiomics::{combine_gmts, MultiOmicsMethod, NormalizationMethod}; use webgestalt_lib::methods::ora::ORAConfig; -use webgestalt_lib::readers::read_rank_file; +use webgestalt_lib::readers::utils::Item; +use webgestalt_lib::readers::{read_gmt_file, read_rank_file}; +use webgestalt_lib::{MalformedError, WebGestaltError}; /// WebGestalt CLI. /// ORA and GSEA enrichment tool. @@ -85,6 +88,7 @@ struct CombineGmtArgs { /// Paths to the files to combine files: Vec, } + #[derive(ValueEnum, Clone)] enum NormMethods { MedianRank, @@ -93,8 +97,15 @@ enum NormMethods { None, } +#[derive(ValueEnum, Clone)] +enum CombinationMethods { + Max, + Mean, +} + #[derive(Args)] struct CombineListArgs { + combination: Option, normalization: Option, out: Option, files: Vec, @@ -211,44 +222,78 @@ fn main() { res.len() ); } - Some(Commands::Test) => { - let list1 = read_rank_file("gene.rnk".to_string()).unwrap(); - let list2 = read_rank_file("protein.rnk".to_string()).unwrap(); - let list3 = read_rank_file("metabolite.rnk".to_string()).unwrap(); - let lists = vec![list1, list2, list3]; - // let gmt1 = webgestalt_lib::readers::read_gmt_file("gene.gmt".to_string()).unwrap(); - // let gmt2 = - // webgestalt_lib::readers::read_gmt_file("metabolite.gmt".to_string()).unwrap(); - // let combined_gmt = webgestalt_lib::methods::multiomics::combine_gmts(&vec![gmt1, gmt2]); - // let mut file = File::create("combined.gmt").unwrap(); - // for row in combined_gmt { - // writeln!(file, "{}\t{}\t{}", row.id, row.url, row.parts.join("\t")).unwrap(); - // } - let mut combined_list = webgestalt_lib::methods::multiomics::combine_lists( - lists, - webgestalt_lib::methods::multiomics::MultiOmicsMethod::Mean, - webgestalt_lib::methods::multiomics::NormalizationMethod::MeanValue, - ); - combined_list.sort_by(|a, b| b.rank.partial_cmp(&a.rank).unwrap()); - let mut file = File::create("combined.rnk").unwrap(); - for row in combined_list { - writeln!(file, "{}\t{}", row.analyte, row.rank).unwrap(); - } - } + Some(Commands::Test) => will_err(1).unwrap_or_else(|x| println!("{}", x)), Some(Commands::Combine(args)) => match &args.combine_type { - Some(CombineType::Gmt(files)) => {} - Some(CombineType::List(files)) => { + Some(CombineType::Gmt(gmt_args)) => { + let style = Style::new().blue().bold(); + println!( + "{}: READING GMTS", + "INFO".if_supports_color(Stdout, |text| text.style(style)) + ); + let mut gmts: Vec> = Vec::new(); + let mut tot_length: usize = 0; + for path in gmt_args.files.clone() { + let gmt = read_gmt_file(path).unwrap(); + tot_length += gmt.len(); + gmts.push(gmt); + } + let combined_gmt = combine_gmts(&gmts); + println!( + "Found {} overlapping sets out of {}", + tot_length - combined_gmt.len(), + combined_gmt.len() + ); + println!( + "{}: CREATING COMBINED GMT AT {}", + "INFO".if_supports_color(Stdout, |text| text.style(style)), + gmt_args.out.clone().unwrap() + ); + let mut file = File::create(gmt_args.out.clone().unwrap()).unwrap(); + for row in combined_gmt { + writeln!(file, "{}\t{}\t{}", row.id, row.url, row.parts.join("\t")).unwrap(); + } + } + Some(CombineType::List(ora_args)) => { + let style = Style::new().blue().bold(); + println!( + "{}: READING LISTS", + "INFO".if_supports_color(Stdout, |text| text.style(style)) + ); let mut lists = Vec::new(); - for file in files.files.iter() { + for file in ora_args.files.iter() { lists.push(read_rank_file(file.clone()).unwrap()); } + let norm_method: NormalizationMethod = match ora_args.normalization { + Some(NormMethods::None) => NormalizationMethod::None, + Some(NormMethods::MeanValue) => NormalizationMethod::MeanValue, + Some(NormMethods::MedianRank) => NormalizationMethod::MedianRank, + Some(NormMethods::MedianValue) => NormalizationMethod::MedianValue, + None => panic!("No normalization method chosen."), + }; + let method: MultiOmicsMethod = match ora_args.combination { + Some(CombinationMethods::Mean) => MultiOmicsMethod::Mean(norm_method), + Some(CombinationMethods::Max) => MultiOmicsMethod::Max(norm_method), + None => panic!("No combination method chosen."), + }; + let mut combined_list = + webgestalt_lib::methods::multiomics::combine_lists(lists, method); + combined_list.sort_by(|a, b| b.rank.partial_cmp(&a.rank).unwrap()); + let mut file = File::create(ora_args.out.clone().unwrap()).unwrap(); + println!( + "{}: CREATING COMBINED LIST AT {}", + "INFO".if_supports_color(Stdout, |text| text.style(style)), + ora_args.out.clone().unwrap() + ); + for row in combined_list { + writeln!(file, "{}\t{}", row.analyte, row.rank).unwrap(); + } } _ => { - panic!("Please select a valid combine type"); + println!("Please select a valid combine type"); } }, _ => { - todo!("Please select a valid command. Run --help for options.") + println!("Please select a valid command. Run --help for options.") } } } @@ -288,3 +333,17 @@ fn benchmark() { let mut ftsv = File::create("format_benchmarks.tsv").unwrap(); writeln!(ftsv, "{}", whole_file.join("\n")).unwrap(); } + +fn will_err(x: i32) -> Result<(), WebGestaltError> { + if x == 0 { + Ok(()) + } else { + Err(WebGestaltError::MalformedFile(MalformedError { + path: String::from("ExamplePath.txt"), + kind: webgestalt_lib::MalformedErrorType::WrongFormat { + found: String::from("GMT"), + expected: String::from("rank"), + }, + })) + } +} diff --git a/webgestalt_lib/src/lib.rs b/webgestalt_lib/src/lib.rs index 62fb2f0..05d9eac 100644 --- a/webgestalt_lib/src/lib.rs +++ b/webgestalt_lib/src/lib.rs @@ -1,23 +1,75 @@ +use std::{error::Error, fmt}; + pub mod methods; pub mod readers; pub mod stat; -pub enum Error { + +trait CustomError { + fn msg(&self) -> String; +} + +#[derive(Debug)] +pub enum WebGestaltError { MalformedFile(MalformedError), + StatisticsError(StatisticsError), IOError(std::io::Error), } -pub enum MalformedError { - NoColumnsFound, - WrongFormat, +impl Error for WebGestaltError {} + +impl fmt::Display for WebGestaltError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let msg: String = match &self { + WebGestaltError::MalformedFile(x) => x.msg(), + WebGestaltError::StatisticsError(x) => x.msg(), + WebGestaltError::IOError(x) => x.to_string(), + }; + write!(f, "{}", msg) + } +} + +#[derive(Debug)] +pub struct MalformedError { + pub path: String, + pub kind: MalformedErrorType, +} + +#[derive(Debug)] +pub enum MalformedErrorType { + NoColumnsFound { delimeter: String }, + WrongFormat { found: String, expected: String }, Unknown, } +impl CustomError for MalformedError { + fn msg(&self) -> String { + let error_msg = match &self.kind { + MalformedErrorType::WrongFormat { found, expected } => format!( + "Wrong Format Found. Found: {}; Expected: {}", + found, expected + ), + MalformedErrorType::Unknown => String::from("Unknown error type."), + MalformedErrorType::NoColumnsFound { delimeter } => format!( + "No column found with delimeter {}", + if delimeter == "\t" { "\\t" } else { delimeter } + ), + }; + format!("Error in {}: {}.", self.path, error_msg) + } +} + +#[derive(Debug)] pub enum StatisticsError { FoundNANValue, - InvalidValue, + InvalidValue { value: f64 }, } -#[cfg(test)] -mod tests { - use super::*; +impl CustomError for StatisticsError { + fn msg(&self) -> String { + let error_msg = match &self { + StatisticsError::FoundNANValue => String::from("Found a NAN value"), + StatisticsError::InvalidValue { value } => format!("Found invalid value: {}", value), + }; + format!("Statstical Error: {}.", error_msg) + } } diff --git a/webgestalt_lib/src/methods/gsea.rs b/webgestalt_lib/src/methods/gsea.rs index 9f6481f..f013404 100644 --- a/webgestalt_lib/src/methods/gsea.rs +++ b/webgestalt_lib/src/methods/gsea.rs @@ -7,6 +7,7 @@ use rayon::prelude::*; use std::sync::{Arc, Mutex}; /// Parameters for GSEA +#[derive(Clone)] pub struct GSEAConfig { /// Power to raise each rank during the enrichment scoring pub p: f64, @@ -62,12 +63,19 @@ impl GSEAResult { #[derive(Clone)] pub struct FullGSEAResult { + /// The set name pub set: String, + /// The statistical p-value pub p: f64, + /// The FDR value pub fdr: f64, + /// The enrichment score pub es: f64, + /// The normalized enrichment score pub nes: f64, + /// Leading edge count pub leading_edge: i32, + /// Running sum vector pub running_sum: Vec, } @@ -276,6 +284,10 @@ fn enrichment_score( /// /// - `analyte_list` - [`Vec`] of the rank list /// - `gmt` - [`Vec`] of gmt file +/// +/// # Returns +/// +/// Returns a [`Vec`] of the GSEA results pub fn gsea( mut analyte_list: Vec, gmt: Vec, @@ -286,7 +298,7 @@ pub fn gsea( analyte_list.sort_by(|a, b| b.rank.partial_cmp(&a.rank).unwrap()); // sort list let (analytes, ranks) = RankListItem::to_vecs(analyte_list.clone()); // seperate into vectors let permutations: Vec> = - provided_permutations.unwrap_or(make_permuations(config.permutations, analytes.len())); + provided_permutations.unwrap_or(make_permutations(config.permutations, analytes.len())); let all_nes = Arc::new(Mutex::new(Vec::new())); let set_nes = Arc::new(Mutex::new(Vec::new())); let all_res = Arc::new(Mutex::new(Vec::new())); @@ -359,7 +371,26 @@ pub fn gsea( final_gsea } -pub fn make_permuations(permutations: i32, max: usize) -> Vec> { +/// Create index permutations for GSEA +/// +/// # Parameters +/// +/// - `permutations` - Number of permutations to create +/// - `max` - Maximum index to permute +/// +/// # Returns +/// +/// Returns a [`Vec>`] of the permutations +/// +/// # Examples +/// +/// ``` +/// use webgestalt_lib::methods::gsea::make_permutations; +/// let permutations = make_permutations(10, 100); +/// assert_eq!(permutations.len(), 10); +/// assert_eq!(permutations[0].len(), 100); +/// ``` +pub fn make_permutations(permutations: i32, max: usize) -> Vec> { let mut temp_permutations: Vec> = Vec::new(); let mut smallrng = rand::rngs::SmallRng::from_entropy(); (0..permutations).for_each(|_i| { diff --git a/webgestalt_lib/src/methods/multiomics.rs b/webgestalt_lib/src/methods/multiomics.rs index 25145d2..1de5a47 100644 --- a/webgestalt_lib/src/methods/multiomics.rs +++ b/webgestalt_lib/src/methods/multiomics.rs @@ -1,16 +1,17 @@ use ahash::{AHashMap, AHashSet}; +use statrs::distribution::{Continuous, ContinuousCDF, Normal}; use super::{ - gsea::{GSEAConfig, RankListItem}, + gsea::{FullGSEAResult, GSEAConfig, RankListItem}, ora::ORAConfig, }; -use crate::readers::utils::Item; +use crate::{methods::gsea::gsea, readers::utils::Item}; pub enum MultiOmicsMethod { /// Get the max median ratio of the analyte from any list - Max, + Max(NormalizationMethod), /// Get the average median ratio of analyte from all the lists - Mean, + Mean(NormalizationMethod), /// Run each list separately and calculate a meta-p value Meta(MetaAnalysisMethod), } @@ -27,11 +28,6 @@ pub enum AnalysisType { ORA, } -pub enum AnalysisJob<'a> { - GSEAJob(GSEAJob<'a>), - ORAJob(ORAJob<'a>), -} - pub struct GSEAJob<'a> { pub gmt: &'a Vec, pub rank_list: &'a Vec, @@ -58,29 +54,79 @@ pub enum NormalizationMethod { /// /// # Parameters /// -/// - `jobs` - A [`Vec`] containing all of the seperates 'jobs' or analysis to combine -/// - `analysis_type` - A [`AnalysisType`] enum of the analysis type (GSEA, ORA, or NTA) to run -/// - `method` - A [`MultiOmicsMethod`] enum detailing the analysis method to combine the runs -/// together (meta-analysis, mean median ration, or max median ratio). -pub fn multiomic_analysis( - _jobs: Vec, - _analysis_type: AnalysisType, - method: MultiOmicsMethod, -) { +/// - `jobs` - A [`Vec`] containing all of the seperates 'jobs' or analysis to combine +/// - `method` - A [`MultiOmicsMethod`] enum detailing the analysis method to combine the runs together (meta-analysis, mean median ration, or max median ratio). +/// +/// # Returns +/// +/// Returns a [`Vec>`] containing the results of each analysis. If the method was not meta-analysis, then the outer vector will only have one element. +/// If the method was meta-analysis, then the first element will be the results of the meta-analysis, and the rest of the elements will be the results of each analysis run individually. +pub fn multiomic_gsea(jobs: Vec, method: MultiOmicsMethod) -> Vec> { if let MultiOmicsMethod::Meta(meta_method) = method { + let mut phash: AHashMap> = AHashMap::default(); + let mut results: Vec> = Vec::new(); + for job in jobs { + let res = gsea(job.rank_list.to_vec(), job.gmt.to_vec(), job.config, None); + for row in res.iter() { + let set = row.set.clone(); + phash.entry(set).or_default().push(row.p); + } + results.push(res); + } + let mut final_result: Vec = Vec::new(); + match meta_method { + MetaAnalysisMethod::Stouffer => { + let normal = Normal::new(0.0, 1.0).unwrap(); + for set in phash.keys() { + final_result.push(FullGSEAResult { + set: set.clone(), + p: stouffer_with_normal(&phash[set], &normal), + fdr: 0.0, + nes: 0.0, + es: 0.0, + running_sum: Vec::new(), + leading_edge: 0, + }); + } + } + MetaAnalysisMethod::Fisher => { + for set in phash.keys() { + final_result.push(FullGSEAResult { + set: set.clone(), + p: fisher(&phash[set]), + fdr: 0.0, + nes: 0.0, + es: 0.0, + running_sum: Vec::new(), + leading_edge: 0, + }); + } + } + } + results.insert(0, final_result); + results } else { + let lists = jobs.iter().map(|x| x.rank_list.clone()).collect(); + let combined_list = combine_lists(lists, method); + let gmts = jobs.iter().map(|x| x.gmt.clone()).collect(); + let combined_gmt = combine_gmts(&gmts); + vec![gsea( + combined_list, + combined_gmt, + jobs.first().unwrap().config.clone(), + None, + )] } } pub fn combine_lists( lists: Vec>, combination_method: MultiOmicsMethod, - normalization_method: NormalizationMethod, ) -> Vec { match combination_method { - MultiOmicsMethod::Max => max_combine(lists, normalization_method), - MultiOmicsMethod::Mean => mean_combine(lists, normalization_method), - MultiOmicsMethod::Meta(_x) => panic!("Lists can not be combine for meta-analysis"), + MultiOmicsMethod::Max(normalization_method) => max_combine(lists, normalization_method), + MultiOmicsMethod::Mean(normalization_method) => mean_combine(lists, normalization_method), + MultiOmicsMethod::Meta(_) => panic!("Lists can not be combined for meta-analysis"), } } @@ -173,7 +219,7 @@ fn normalize(list: &mut Vec, method: NormalizationMethod) -> Vec, method: NormalizationMethod) -> Vec>) -> Vec { } final_gmt } + +/// Calculates meta-p values using the Stouffer method ([DOI:10.1037/h0051438](https://doi.org/10.1037/h0051438)) of `vals` +/// +/// # Arguments +/// - `val` - `Vec` of p-values to combine +/// +/// # Examples +/// +/// ```rust +/// use webgestalt_lib::methods::multiomics::stouffer; +/// let vals: Vec = vec![0.1, 0.01, 0.11, 0.23]; +/// let metap: f64 = stouffer(vals); +/// ``` +pub fn stouffer(vals: Vec) -> f64 { + let n = Normal::new(0.0, 1.0).unwrap(); + let k = vals.len(); + n.cdf(vals.iter().map(|x| n.inverse_cdf(*x)).sum::() / f64::sqrt(k as f64)) +} + +fn stouffer_with_normal(vals: &Vec, normal: &Normal) -> f64 { + let k = vals.len(); + normal.cdf(vals.iter().map(|x| normal.inverse_cdf(*x)).sum::() / f64::sqrt(k as f64)) +} + +pub fn fisher(vals: &Vec) -> f64 { + let k = vals.len(); + let pt = -2.0 * vals.iter().map(|x| x.ln()).sum::(); + let dist = statrs::distribution::ChiSquared::new(2_f64.powi(k as i32 - 1)).unwrap(); + dist.pdf(pt) +} + +/// Calculates meta-p values using the Stouffer weighted method ([10.1214/aoms/1177698861](https://doi.org/10.1214/aoms/1177698861)) of `vals` with weights in `weights` +/// +/// # Arguments +/// - `val` - [`Vec`] of p-values to combine +/// - `weights` - [`Vec`] of weights corresponding to each p-value +/// +/// # Examples +/// +/// ```rust +/// use webgestalt_lib::methods::multiomics::stouffer_weighted; +/// let vals: Vec = vec![0.1, 0.01, 0.11, 0.23]; +/// let weights: Vec = vec![0.1, 0.2, 0.3, 0.4]; +/// let metap: f64 = stouffer_weighted(vals, weights); +/// ``` +pub fn stouffer_weighted(vals: Vec, weights: Vec) -> f64 { + let n = Normal::new(0.0, 1.0).unwrap(); + n.cdf( + vals.iter() + .enumerate() + .map(|(i, x)| weights[i] * n.inverse_cdf(*x)) + .sum::() + / f64::sqrt(weights.iter().map(|x| x * x).sum::()), + ) +}