diff --git a/mzLib/FlashLFQ/ChromatographicPeak.cs b/mzLib/FlashLFQ/ChromatographicPeak.cs index 9041eab66..d7bac2195 100644 --- a/mzLib/FlashLFQ/ChromatographicPeak.cs +++ b/mzLib/FlashLFQ/ChromatographicPeak.cs @@ -1,9 +1,9 @@ -using MzLibUtil; -using System; +using System; using System.Collections.Generic; using System.Linq; using System.Text; using ClassExtensions = Chemistry.ClassExtensions; +using FlashLFQ.PEP; namespace FlashLFQ { @@ -16,20 +16,23 @@ public class ChromatographicPeak public int ScanCount => IsotopicEnvelopes.Count; public double SplitRT; public readonly bool IsMbrPeak; - public double PredictedRetentionTime { get; init; } - + public double MbrScore; + public double PpmScore { get; set; } + public double IntensityScore { get; set; } + public double RtScore { get; set; } + public double ScanCountScore { get; set; } + public double IsotopicDistributionScore { get; set; } /// - /// A score bounded by 100 and 0, with more confident MBR-detections receiving higher scores + /// Stores the pearson correlation between the apex isotopic envelope and the theoretical isotopic distribution /// - public double MbrScore { get; private set; } - - /// The four scores below are bounded by 0 and 1, with higher scores being better - public double PpmScore { get; private set; } - public double IntensityScore { get; private set; } - public double RtScore { get; private set; } - public double ScanCountScore { get; private set; } - - public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fileInfo) + public double IsotopicPearsonCorrelation => Apex?.PearsonCorrelation ?? -1; + public double RtPredictionError { get; set; } + public List ChargeList { get; set; } + internal double MbrQValue { get; set; } + public ChromatographicPeakData PepPeakData { get; set; } + public double? 
MbrPep { get; set; } + + public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fileInfo, bool randomRt = false) { SplitRT = 0; NumChargeStatesObserved = 0; @@ -40,12 +43,14 @@ public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fi IsotopicEnvelopes = new List(); IsMbrPeak = isMbrPeak; SpectraFileInfo = fileInfo; + RandomRt = randomRt; } - public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fileInfo, double predictedRetentionTime) : - this(id, isMbrPeak, fileInfo) + public bool Equals(ChromatographicPeak peak) { - PredictedRetentionTime = predictedRetentionTime; + return SpectraFileInfo.Equals(peak.SpectraFileInfo) + && Identifications.First().ModifiedSequence.Equals(peak.Identifications.First().ModifiedSequence) + && ApexRetentionTime == peak.ApexRetentionTime; } public IsotopicEnvelope Apex { get; private set; } @@ -54,13 +59,18 @@ public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fi public int NumIdentificationsByBaseSeq { get; private set; } public int NumIdentificationsByFullSeq { get; private set; } public double MassError { get; private set; } + /// + /// Bool that describes whether the retention time of this peak was randomized + /// If true, implies that this peak is a decoy peak identified by the MBR algorithm + /// + public bool RandomRt { get; } + public bool DecoyPeptide => Identifications.First().IsDecoy; public void CalculateIntensityForThisFeature(bool integrate) { if (IsotopicEnvelopes.Any()) { - double maxIntensity = IsotopicEnvelopes.Max(p => p.Intensity); - Apex = IsotopicEnvelopes.First(p => p.Intensity == maxIntensity); + Apex = IsotopicEnvelopes.MaxBy(p => p.Intensity); if (integrate) { @@ -123,25 +133,6 @@ public void ResolveIdentifications() this.NumIdentificationsByBaseSeq = Identifications.Select(v => v.BaseSequence).Distinct().Count(); this.NumIdentificationsByFullSeq = Identifications.Select(v => v.ModifiedSequence).Distinct().Count(); 
} - - /// - /// Calculates four component scores and one overarching Mbr score for an MBR peak. - /// MBR Score is equal to 100 * the geometric mean of the four component scores. - /// - /// An MbrScorer specific to the file where this peak was found - /// The donor peak used as the basis for the MBR identification. - internal void CalculateMbrScore(MbrScorer scorer, ChromatographicPeak donorPeak) - { - if (SpectraFileInfo != scorer.AcceptorFile) throw new MzLibException("Error when performing match-between-runs: Mismatch between scorer and peak."); - - IntensityScore = scorer.CalculateIntensityScore(this, donorPeak); - RtScore = scorer.CalculateRetentionTimeScore(this, donorPeak); - PpmScore = scorer.CalculatePpmErrorScore(this); - ScanCountScore = scorer.CalculateScanCountScore(this); - - MbrScore = 100 * Math.Pow(IntensityScore * RtScore * PpmScore * ScanCountScore, 0.25); - } - public static string TabSeparatedHeader { get @@ -164,20 +155,18 @@ public static string TabSeparatedHeader sb.Append("Peak Charge" + "\t"); sb.Append("Num Charge States Observed" + "\t"); sb.Append("Peak Detection Type" + "\t"); - sb.Append("MBR Score" + "\t"); - sb.Append("Ppm Score" + "\t"); - sb.Append("Intensity Score" + "\t"); - sb.Append("Rt Score" + "\t"); - sb.Append("Scan Count Score" + "\t"); + sb.Append("PIP Q-Value" + "\t"); + sb.Append("PIP PEP" + "\t"); sb.Append("PSMs Mapped" + "\t"); sb.Append("Base Sequences Mapped" + "\t"); sb.Append("Full Sequences Mapped" + "\t"); sb.Append("Peak Split Valley RT" + "\t"); - sb.Append("Peak Apex Mass Error (ppm)"); + sb.Append("Peak Apex Mass Error (ppm)" + "\t"); + sb.Append("Decoy Peptide" + "\t"); + sb.Append("Random RT"); return sb.ToString(); } } - public override string ToString() { StringBuilder sb = new StringBuilder(); @@ -260,17 +249,16 @@ public override string ToString() sb.Append("" + "MSMS" + "\t"); } - sb.Append("" + (IsMbrPeak ? MbrScore.ToString() : "") + "\t"); - sb.Append("" + (IsMbrPeak ? 
PpmScore.ToString() : "") + "\t"); - sb.Append("" + (IsMbrPeak ? IntensityScore.ToString() : "") + "\t"); - sb.Append("" + (IsMbrPeak ? RtScore.ToString() : "") + "\t"); - sb.Append("" + (IsMbrPeak ? ScanCountScore.ToString() : "") + "\t"); + sb.Append("" + (IsMbrPeak ? MbrQValue.ToString() : "") + "\t"); + sb.Append("" + (IsMbrPeak ? MbrPep.ToString() : "") + "\t"); sb.Append("" + Identifications.Count + "\t"); sb.Append("" + NumIdentificationsByBaseSeq + "\t"); sb.Append("" + NumIdentificationsByFullSeq + "\t"); sb.Append("" + SplitRT + "\t"); sb.Append("" + MassError); + sb.Append("\t" + DecoyPeptide); + sb.Append("\t" + RandomRt); return sb.ToString(); } diff --git a/mzLib/FlashLFQ/FlashLFQ.csproj b/mzLib/FlashLFQ/FlashLFQ.csproj index 097bf6131..d5c466967 100644 --- a/mzLib/FlashLFQ/FlashLFQ.csproj +++ b/mzLib/FlashLFQ/FlashLFQ.csproj @@ -13,6 +13,8 @@ + + diff --git a/mzLib/FlashLFQ/FlashLFQResults.cs b/mzLib/FlashLFQ/FlashLFQResults.cs index 9e5f8e0e4..b20daa7ea 100644 --- a/mzLib/FlashLFQ/FlashLFQResults.cs +++ b/mzLib/FlashLFQ/FlashLFQResults.cs @@ -15,27 +15,26 @@ public class FlashLfqResults public readonly Dictionary ProteinGroups; public readonly Dictionary> Peaks; private readonly HashSet _peptideModifiedSequencesToQuantify; + public string PepResultString { get; set; } + public double MbrQValueThreshold { get; set; } - public FlashLfqResults(List spectraFiles, List identifications, HashSet peptides = null) + public FlashLfqResults(List spectraFiles, List identifications, double mbrQValueThreshold = 0.05, + HashSet peptideModifiedSequencesToQuantify = null) { SpectraFiles = spectraFiles; PeptideModifiedSequences = new Dictionary(); ProteinGroups = new Dictionary(); Peaks = new Dictionary>(); - if(peptides == null || !peptides.Any()) - { - peptides = identifications.Select(id => id.ModifiedSequence).ToHashSet(); - } - _peptideModifiedSequencesToQuantify = peptides; + MbrQValueThreshold = mbrQValueThreshold; + _peptideModifiedSequencesToQuantify = 
peptideModifiedSequencesToQuantify ?? identifications.Where(id => !id.IsDecoy).Select(id => id.ModifiedSequence).ToHashSet(); foreach (SpectraFileInfo file in spectraFiles) { Peaks.Add(file, new List()); } - // Only quantify peptides within the set of valid peptide modified (full) sequences. This is done to enable pepitde-level FDR control of reported results - foreach (Identification id in identifications.Where(id => peptides.Contains(id.ModifiedSequence))) + foreach (Identification id in identifications.Where(id => !id.IsDecoy & _peptideModifiedSequencesToQuantify.Contains(id.ModifiedSequence))) { if (!PeptideModifiedSequences.TryGetValue(id.ModifiedSequence, out Peptide peptide)) { @@ -59,6 +58,17 @@ public FlashLfqResults(List spectraFiles, List } } + public void ReNormalizeResults(bool integrate = false, int maxThreads = 10, bool useSharedPeptides = false) + { + foreach(var peak in Peaks.SelectMany(p => p.Value)) + { + peak.CalculateIntensityForThisFeature(integrate); + } + new IntensityNormalizationEngine(this, integrate, silent: true, maxThreads).NormalizeResults(); + CalculatePeptideResults(quantifyAmbiguousPeptides: false); + CalculateProteinResultsMedianPolish(useSharedPeptides: useSharedPeptides); + } + public void MergeResultsWith(FlashLfqResults mergeFrom) { this.SpectraFiles.AddRange(mergeFrom.SpectraFiles); @@ -128,6 +138,8 @@ public void CalculatePeptideResults(bool quantifyAmbiguousPeptides) { var groupedPeaks = filePeaks.Value .Where(p => p.NumIdentificationsByFullSeq == 1) + .Where(p => !p.Identifications.First().IsDecoy) + .Where(p => !p.IsMbrPeak || (p.MbrQValue < MbrQValueThreshold && !p.RandomRt)) .GroupBy(p => p.Identifications.First().ModifiedSequence) .Where(group => _peptideModifiedSequencesToQuantify.Contains(group.Key)) .ToList(); @@ -163,11 +175,15 @@ public void CalculatePeptideResults(bool quantifyAmbiguousPeptides) // report ambiguous quantification var ambiguousPeaks = filePeaks.Value .Where(p => p.NumIdentificationsByFullSeq > 1) 
+ .Where(p => !p.Identifications.First().IsDecoy) + .Where(p => !p.IsMbrPeak || (p.MbrQValue < MbrQValueThreshold && !p.RandomRt)) .ToList(); foreach (ChromatographicPeak ambiguousPeak in ambiguousPeaks) { - foreach (Identification id in ambiguousPeak.Identifications) + foreach (Identification id in ambiguousPeak.Identifications.Where(id => !id.IsDecoy)) { + if (!_peptideModifiedSequencesToQuantify.Contains(id.ModifiedSequence)) continue; // Ignore the ids/sequences we don't want to quantify + string sequence = id.ModifiedSequence; double alreadyRecordedIntensity = PeptideModifiedSequences[sequence].GetIntensity(filePeaks.Key); @@ -224,7 +240,7 @@ private void HandleAmbiguityInFractions() foreach (SpectraFileInfo file in sample) { - foreach (ChromatographicPeak peak in Peaks[file]) + foreach (ChromatographicPeak peak in Peaks[file].Where(p => !p.IsMbrPeak || p.MbrQValue < MbrQValueThreshold)) { foreach (Identification id in peak.Identifications) { @@ -549,7 +565,7 @@ public void CalculateProteinResultsMedianPolish(bool useSharedPeptides) } } - public void WriteResults(string peaksOutputPath, string modPeptideOutputPath, string proteinOutputPath, string bayesianProteinQuantOutput, bool silent = true) + public void WriteResults(string peaksOutputPath, string modPeptideOutputPath, string proteinOutputPath, string bayesianProteinQuantOutput, bool silent) { if (!silent) { diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 712eae6a1..01d3c6dc0 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -12,11 +12,21 @@ using UsefulProteomicsDatabases; using System.Runtime.CompilerServices; using Easy.Common.Extensions; +using FlashLFQ.PEP; +using System.IO; +using System.Threading; [assembly: InternalsVisibleTo("TestFlashLFQ")] namespace FlashLFQ { + public enum DonorCriterion + { + Score, + Intensity, + Neighbors + } + public class FlashLfqEngine { // settings @@ -38,9 +48,24 @@ public class FlashLfqEngine 
public readonly bool MatchBetweenRuns; public readonly double MbrRtWindow; public readonly double MbrPpmTolerance; - public readonly bool RequireMsmsIdInCondition; + public readonly double MbrDetectionQValueThreshold; private int _numberOfAnchorPeptidesForMbr = 3; // the number of anchor peptides used for local alignment when predicting retention times of MBR acceptor peptides + // New MBR Settings + public readonly double RtWindowIncrease = 0; + public readonly double MbrAlignmentWindow = 2.5; + public readonly double PepTrainingFraction = 0.25; + /// + /// Specifies how the donor peak for MBR is selected. + /// 'Score' selects the donor peak associated with the highest scoring PSM + /// 'Intensity' selects the donor peak with the max intensity + /// 'Neighbors' selects the donor peak with the most neighboring peaks + /// + public DonorCriterion DonorCriterion { get; init; } + public readonly double DonorQValueThreshold; + public readonly bool RequireMsmsIdInCondition; + private int _randomSeed = 42; + // settings for the Bayesian protein quantification engine public readonly bool BayesianProteinQuant; @@ -61,7 +86,7 @@ public class FlashLfqEngine /// Other peptides may appear in the QuantifiedPeaks output, but this list is used to enable /// peptide-level FDR filtering /// - public HashSet PeptidesModifiedSequencesToQuantify { get; init; } + public HashSet PeptideModifiedSequencesToQuantify { get; init; } /// /// Dictionary linking a modified sequence to a List of tuples containing /// the mass shifts (isotope mass - monoisotopic mass) and normalized abundances for the @@ -72,6 +97,7 @@ public class FlashLfqEngine private FlashLfqResults _results; internal Dictionary _ms1Scans; internal PeakIndexingEngine _peakIndexingEngine; + internal Dictionary> DonorFileToPeakDict { get; private set; } /// /// Create an instance of FlashLFQ that will quantify peptides based on their precursor intensity in MS1 spectra @@ -95,8 +121,9 @@ public FlashLfqEngine( // MBR settings 
bool matchBetweenRuns = false, double matchBetweenRunsPpmTolerance = 10.0, - double maxMbrWindow = 2.5, + double maxMbrWindow = 1.0, bool requireMsmsIdInCondition = false, + double matchBetweenRunsFdrThreshold = 0.05, // settings for the Bayesian protein quantification engine bool bayesianProteinQuant = false, @@ -106,8 +133,10 @@ public FlashLfqEngine( int mcmcBurninSteps = 1000, bool useSharedPeptidesForProteinQuant = false, bool pairedSamples = false, - List peptideSequencesToUse = null, - int? randomSeed = null) + int? randomSeed = null, + DonorCriterion donorCriterion = DonorCriterion.Score, + double donorQValueThreshold = 0.01, + List peptideSequencesToQuantify = null) { Loaders.LoadElements(); @@ -122,17 +151,17 @@ public FlashLfqEngine( .ThenBy(p => p.TechnicalReplicate).ToList(); _allIdentifications = allIdentifications; + PeptideModifiedSequencesToQuantify = peptideSequencesToQuantify.IsNotNullOrEmpty() + ? new HashSet(peptideSequencesToQuantify) + : allIdentifications.Select(id => id.ModifiedSequence).ToHashSet(); PpmTolerance = ppmTolerance; IsotopePpmTolerance = isotopeTolerancePpm; - MatchBetweenRuns = matchBetweenRuns; - MbrPpmTolerance = matchBetweenRunsPpmTolerance; + Integrate = integrate; NumIsotopesRequired = numIsotopesRequired; QuantifyAmbiguousPeptides = quantifyAmbiguousPeptides; Silent = silent; IdSpecificChargeState = idSpecificChargeState; - MbrRtWindow = maxMbrWindow; - RequireMsmsIdInCondition = requireMsmsIdInCondition; Normalize = normalize; MaxThreads = maxThreads; @@ -143,8 +172,14 @@ public FlashLfqEngine( McmcSteps = mcmcSteps; McmcBurninSteps = mcmcBurninSteps; UseSharedPeptidesForProteinQuant = useSharedPeptidesForProteinQuant; - PeptidesModifiedSequencesToQuantify = peptideSequencesToUse.IsNotNullOrEmpty() ? 
new HashSet(peptideSequencesToUse) - : allIdentifications.Select(id => id.ModifiedSequence).ToHashSet(); + + // MBR settings + MatchBetweenRuns = matchBetweenRuns; + MbrPpmTolerance = matchBetweenRunsPpmTolerance; + MbrRtWindow = maxMbrWindow; + DonorCriterion = donorCriterion; + DonorQValueThreshold = donorQValueThreshold; + MbrDetectionQValueThreshold = matchBetweenRunsFdrThreshold; RandomSeed = randomSeed; if (MaxThreads == -1 || MaxThreads >= Environment.ProcessorCount) @@ -166,7 +201,7 @@ public FlashLfqResults Run() { _globalStopwatch.Start(); _ms1Scans = new Dictionary(); - _results = new FlashLfqResults(_spectraFileInfo, _allIdentifications, PeptidesModifiedSequencesToQuantify); + _results = new FlashLfqResults(_spectraFileInfo, _allIdentifications, MbrDetectionQValueThreshold, PeptideModifiedSequencesToQuantify); // build m/z index keys CalculateTheoreticalIsotopeDistributions(); @@ -206,6 +241,8 @@ public FlashLfqResults Run() // do MBR if (MatchBetweenRuns) { + Console.WriteLine("Find the best donors for match-between-runs"); + FindPeptideDonorFiles(); foreach (var spectraFile in _spectraFileInfo) { if (!Silent) @@ -214,7 +251,6 @@ public FlashLfqResults Run() } QuantifyMatchBetweenRunsPeaks(spectraFile); - _peakIndexingEngine.ClearIndex(); if (!Silent) @@ -222,6 +258,14 @@ public FlashLfqResults Run() Console.WriteLine("Finished MBR for " + spectraFile.FilenameWithoutExtension); } } + + Console.WriteLine("Computing PEP for MBR Transfers"); + bool pepSuccesful = RunPEPAnalysis(); + + foreach (var spectraFile in _spectraFileInfo) + { + CalculateFdrForMbrPeaks(spectraFile, pepSuccesful); + } } // normalize @@ -489,52 +533,59 @@ private void QuantifyMs2IdentifiedPeptides(SpectraFileInfo fileInfo) _results.Peaks[fileInfo].AddRange(chromatographicPeaks.ToList()); } + #region MatchBetweenRuns /// /// Used by the match-between-runs algorithm to determine systematic retention time drifts between /// chromatographic runs. 
/// - private RetentionTimeCalibDataPoint[] GetRtCalSpline(SpectraFileInfo donor, SpectraFileInfo acceptor, MbrScorer scorer) - { + private RetentionTimeCalibDataPoint[] GetRtCalSpline(SpectraFileInfo donor, SpectraFileInfo acceptor, MbrScorer scorer, + out List donorFileBestMsmsPeaksOrderedByMass) + { Dictionary donorFileBestMsmsPeaks = new(); Dictionary acceptorFileBestMsmsPeaks = new(); List rtCalibrationCurve = new(); List anchorPeptideRtDiffs = new(); // anchor peptides are peptides that were MS2 detected in both the donor and acceptor runs + Dictionary> donorFileAllMsmsPeaks = _results.Peaks[donor] + .Where(peak => peak.NumIdentificationsByFullSeq == 1 + && !peak.IsMbrPeak + && peak.IsotopicEnvelopes.Any() + && peak.Identifications.Min(id => id.QValue) < DonorQValueThreshold) + .GroupBy(peak => peak.Identifications.First().ModifiedSequence) + .ToDictionary(group => group.Key, group => group.ToList()); - // get all peaks, not counting ambiguous peaks - IEnumerable donorPeaks = _results.Peaks[donor].Where(p => p.Apex != null && !p.IsMbrPeak && p.NumIdentificationsByFullSeq == 1); - IEnumerable acceptorPeaks = _results.Peaks[acceptor].Where(p => p.Apex != null && !p.IsMbrPeak && p.NumIdentificationsByFullSeq == 1); - - // get the best (most intense) peak for each peptide in the acceptor file - foreach (ChromatographicPeak acceptorPeak in acceptorPeaks) + // iterate through each unique donor sequence + foreach (var sequencePeakListKvp in donorFileAllMsmsPeaks) { - if (acceptorFileBestMsmsPeaks.TryGetValue(acceptorPeak.Identifications.First().ModifiedSequence, out ChromatographicPeak currentBestPeak)) - { - if (currentBestPeak.Intensity > acceptorPeak.Intensity) - { - acceptorFileBestMsmsPeaks[acceptorPeak.Identifications.First().ModifiedSequence] = acceptorPeak; - } - } - else - { - acceptorFileBestMsmsPeaks.Add(acceptorPeak.Identifications.First().ModifiedSequence, acceptorPeak); - } + List peaksForPeptide = sequencePeakListKvp.Value; + if 
(!peaksForPeptide.Any()) + continue; + + ChromatographicPeak bestPeak = ChooseBestPeak(peaksForPeptide); + + if (bestPeak == null) continue; + donorFileBestMsmsPeaks.Add(sequencePeakListKvp.Key, bestPeak); } - // get the best (most intense) peak for each peptide in the donor file - foreach (ChromatographicPeak donorPeak in donorPeaks) + Dictionary> acceptorFileAllMsmsPeaks = _results.Peaks[acceptor] + .Where(peak => peak.NumIdentificationsByFullSeq == 1 + && !peak.IsMbrPeak + && peak.IsotopicEnvelopes.Any() + && peak.Identifications.Min(id => id.QValue) < DonorQValueThreshold) + .GroupBy(peak => peak.Identifications.First().ModifiedSequence) + .ToDictionary(group => group.Key, group => group.ToList()); + + // iterate through each acceptor sequence + foreach (var sequencePeakListKvp in acceptorFileAllMsmsPeaks) { - if (donorFileBestMsmsPeaks.TryGetValue(donorPeak.Identifications.First().ModifiedSequence, out ChromatographicPeak currentBestPeak)) - { - if (currentBestPeak.Intensity > donorPeak.Intensity) - { - donorFileBestMsmsPeaks[donorPeak.Identifications.First().ModifiedSequence] = donorPeak; - } - } - else - { - donorFileBestMsmsPeaks.Add(donorPeak.Identifications.First().ModifiedSequence, donorPeak); - } + List peaksForPeptide = sequencePeakListKvp.Value; + if (!peaksForPeptide.Any()) + continue; + + ChromatographicPeak bestPeak = ChooseBestPeak(peaksForPeptide); + + if (bestPeak == null) continue; + acceptorFileBestMsmsPeaks.Add(sequencePeakListKvp.Key, bestPeak); } // create RT calibration curve @@ -545,7 +596,7 @@ private RetentionTimeCalibDataPoint[] GetRtCalSpline(SpectraFileInfo donor, Spec if (donorFileBestMsmsPeaks.TryGetValue(peak.Key, out ChromatographicPeak donorFilePeak)) { rtCalibrationCurve.Add(new RetentionTimeCalibDataPoint(donorFilePeak, acceptorFilePeak)); - if(donorFilePeak.ApexRetentionTime > 0 && acceptorFilePeak.ApexRetentionTime > 0) + if (donorFilePeak.ApexRetentionTime > 0 && acceptorFilePeak.ApexRetentionTime > 0) { 
anchorPeptideRtDiffs.Add(donorFilePeak.ApexRetentionTime - acceptorFilePeak.ApexRetentionTime); } @@ -553,10 +604,89 @@ private RetentionTimeCalibDataPoint[] GetRtCalSpline(SpectraFileInfo donor, Spec } scorer.AddRtPredErrorDistribution(donor, anchorPeptideRtDiffs, _numberOfAnchorPeptidesForMbr); + donorFileBestMsmsPeaksOrderedByMass = donorFileBestMsmsPeaks.Select(kvp => kvp.Value).OrderBy(p => p.Identifications.First().PeakfindingMass).ToList(); return rtCalibrationCurve.OrderBy(p => p.DonorFilePeak.Apex.IndexedPeak.RetentionTime).ToArray(); } + /// + /// For every MSMS identified peptide, selects one file that will be used as the donor + /// by finding files that contain the most peaks in the local neighborhood, + /// then writes the results to the DonorFileToIdsDict. + /// WARNING! Strong assumption that this is called BEFORE MBR peaks are identified/assigned to the results + /// + private void FindPeptideDonorFiles() + { + DonorFileToPeakDict = new Dictionary>(); + + Dictionary> seqPeakDict = _results.Peaks + .SelectMany(kvp => kvp.Value) + .Where(peak => peak.NumIdentificationsByFullSeq == 1 + && peak.IsotopicEnvelopes.Any() + && peak.Identifications.Min(id => id.QValue) < DonorQValueThreshold) + .GroupBy(peak => peak.Identifications.First().ModifiedSequence) + .Where(group => PeptideModifiedSequencesToQuantify.Contains(group.Key)) + .ToDictionary(group => group.Key, group => group.ToList()); + + // iterate through each unique sequence + foreach (var sequencePeakListKvp in seqPeakDict) + { + List peaksForPeptide = sequencePeakListKvp.Value; + if (!peaksForPeptide.Any()) + continue; + + ChromatographicPeak bestPeak = ChooseBestPeak(peaksForPeptide); + + if (bestPeak == null) continue; + if (DonorFileToPeakDict.ContainsKey(bestPeak.SpectraFileInfo)) + { + DonorFileToPeakDict[bestPeak.SpectraFileInfo].Add(bestPeak); + } + else + { + DonorFileToPeakDict.Add(bestPeak.SpectraFileInfo, new List { bestPeak }); + } + } + } + + internal ChromatographicPeak 
ChooseBestPeak(List peaks) + { + ChromatographicPeak bestPeak = null; + switch (DonorCriterion) + { + case DonorCriterion.Score: // Select best peak by the PSM score + bestPeak = peaks.MaxBy(peak => peak.Identifications.Max(id => id.PsmScore)); + if (bestPeak.Identifications.First().PsmScore > 0) + break; + else // if every ID has a score of zero, let it fall through to the default case + goto default; + case DonorCriterion.Neighbors: // Select peak with the most neighboring peaks + int maxPeaks = 0; + foreach (var donorPeak in peaks) + { + // Count the number of neighboring peaks with unique peptides + int neighboringPeaksCount = _results.Peaks[donorPeak.SpectraFileInfo] + .Where(peak => Math.Abs(peak.ApexRetentionTime - donorPeak.ApexRetentionTime) < MbrAlignmentWindow) + .Select(peak => peak.Identifications.First().ModifiedSequence) + .Distinct() + .Count(); + + if (neighboringPeaksCount > maxPeaks) + { + maxPeaks = neighboringPeaksCount; + bestPeak = donorPeak; + } + } + break; + case DonorCriterion.Intensity: // Select the peak with the highest intensity + default: + bestPeak = peaks.MaxBy(peak => peak.Intensity); + break; + } + + return bestPeak; + } + /// /// Used by MBR. Predicts the retention time of a peak in an acceptor file based on the /// retention time of the peak in the donor file. 
This is done with a local alignment @@ -601,21 +731,21 @@ internal RtInfo PredictRetentionTime( int numberOfForwardAnchors = 0; // gather nearby data points - for (int r = index+1; r < rtCalibrationCurve.Length; r++) + for (int r = index + 1; r < rtCalibrationCurve.Length; r++) { double rtDiff = rtCalibrationCurve[r].DonorFilePeak.Apex.IndexedPeak.RetentionTime - donorPeak.Apex.IndexedPeak.RetentionTime; - if (rtCalibrationCurve[r].AcceptorFilePeak != null + if (rtCalibrationCurve[r].AcceptorFilePeak != null && rtCalibrationCurve[r].AcceptorFilePeak.ApexRetentionTime > 0) { - if(Math.Abs(rtDiff) > 0.5) // If the rtDiff is too large, it's no longer local alignment + if (Math.Abs(rtDiff) > 0.5) // If the rtDiff is too large, it's no longer local alignment { break; } nearbyCalibrationPoints.Add(rtCalibrationCurve[r]); numberOfForwardAnchors++; - if(numberOfForwardAnchors >= _numberOfAnchorPeptidesForMbr) // We only want a handful of anchor points + if (numberOfForwardAnchors >= _numberOfAnchorPeptidesForMbr) // We only want a handful of anchor points { - break; + break; } } } @@ -642,20 +772,25 @@ internal RtInfo PredictRetentionTime( if (!nearbyCalibrationPoints.Any()) { - return null; + // If there are no nearby calibration points, return the donor peak's RT and a width of 15 seconds + return new RtInfo(predictedRt: donorPeak.Apex.IndexedPeak.RetentionTime, width: 0.25); } // calculate difference between acceptor and donor RTs for these RT region List rtDiffs = nearbyCalibrationPoints - .Select(p => p.DonorFilePeak.ApexRetentionTime - p.AcceptorFilePeak.ApexRetentionTime) + .Select(p => p.DonorFilePeak.ApexRetentionTime - p.AcceptorFilePeak.ApexRetentionTime) .ToList(); - + double medianRtDiff = rtDiffs.Median(); - double rtRange = rtDiffs.InterquartileRange() * 4.5; - // IQR * 4.5 is roughly equivalent to 6 StdDevs, so search window extends ~3 std.devs from either side of predicted RT - // IQR is less affected by outliers than StdDev + if(rtDiffs.Count == 1) + { + 
// If there is exactly one nearby calibration point, apply its RT difference directly and default to a width of 15 seconds + return new RtInfo(predictedRt: donorPeak.Apex.IndexedPeak.RetentionTime - medianRtDiff, width: 0.25); + } - rtRange = Math.Min(rtRange, MbrRtWindow); + double rtRange = rtDiffs.StandardDeviation() * 6; + + rtRange = Math.Min(rtRange, MbrRtWindow); return new RtInfo(predictedRt: donorPeak.Apex.IndexedPeak.RetentionTime - medianRtDiff, width: rtRange); } @@ -672,7 +807,8 @@ private MbrScorer BuildMbrScorer(List acceptorFileIdentifie var apexToAcceptorFilePeakDict = new Dictionary(); List ppmErrors = new List(); foreach (var peak in acceptorFileIdentifiedPeaks.Where(p => p.Apex != null - && PeptidesModifiedSequencesToQuantify.Contains(p.Identifications.First().ModifiedSequence))) + && PeptideModifiedSequencesToQuantify.Contains(p.Identifications.First().ModifiedSequence) + && p.Identifications.First().QValue < DonorQValueThreshold)) { if (!apexToAcceptorFilePeakDict.ContainsKey(peak.Apex.IndexedPeak)) { @@ -702,6 +838,56 @@ private MbrScorer BuildMbrScorer(List acceptorFileIdentifie return new MbrScorer(apexToAcceptorFilePeakDict, acceptorFileIdentifiedPeaks, ppmDistribution, logIntensityDistribution); } + /// + /// Returns a pseudo-randomly selected peak that does not have the same mass as the donor + /// + /// + /// Will search for a peak at least 5 Da away from the peakfinding mass + /// + internal ChromatographicPeak GetRandomPeak( + List peaksOrderedByMass, + double donorPeakRetentionTime, + double retentionTimeMinDiff, + Identification donorIdentification) + { + double minDiff = 5 * PeriodicTable.GetElement("H").PrincipalIsotope.AtomicMass; + double maxDiff = 11 * PeriodicTable.GetElement("H").PrincipalIsotope.AtomicMass; + double donorPeakPeakfindingMass = donorIdentification.PeakfindingMass; + + // Theoretically we could do a binary search but we're just going to iterate through the whole list of donor peaks + List randomPeakCandidates = peaksOrderedByMass
.Where(p => + p.ApexRetentionTime > 0 + && Math.Abs(p.ApexRetentionTime - donorPeakRetentionTime) > retentionTimeMinDiff + && p.Identifications.First().BaseSequence != donorIdentification.BaseSequence + && Math.Abs(p.Identifications.First().PeakfindingMass - donorPeakPeakfindingMass) > minDiff + && Math.Abs(p.Identifications.First().PeakfindingMass - donorPeakPeakfindingMass) < maxDiff) + .ToList(); + + while (!randomPeakCandidates.Any() & maxDiff < 1e5) + { + // Increase the search space by a factor of 10 and try again + maxDiff *= 10; + randomPeakCandidates = peaksOrderedByMass + .Where(p => + p.ApexRetentionTime > 0 + && Math.Abs(p.ApexRetentionTime - donorPeakRetentionTime) > retentionTimeMinDiff + && p.Identifications.First().BaseSequence != donorIdentification.BaseSequence + && Math.Abs(p.Identifications.First().PeakfindingMass - donorPeakPeakfindingMass) > minDiff + && Math.Abs(p.Identifications.First().PeakfindingMass - donorPeakPeakfindingMass) < maxDiff) + .ToList(); + } + + if (!randomPeakCandidates.Any()) + { + return null; + } + + // Generates a pseudo-random number based on the donor peak finding mass + retention time + int pseudoRandomNumber = (int)(1e5 * (donorIdentification.PeakfindingMass % 1.0) * (donorIdentification.Ms2RetentionTimeInMinutes % 1.0)) % randomPeakCandidates.Count; + return randomPeakCandidates[pseudoRandomNumber]; + } + /// /// This method maps identified peaks from other chromatographic runs ("donors") onto /// the defined chromatographic run ("acceptor"). The goal is to reduce the number of missing @@ -722,7 +908,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) // these are the analytes already identified in this run. 
we don't need to try to match them from other runs var acceptorFileIdentifiedSequences = new HashSet(acceptorFileIdentifiedPeaks - .Where(peak => peak.IsotopicEnvelopes.Any()) + .Where(peak => peak.IsotopicEnvelopes.Any() && peak.Identifications.Min(id => id.QValue) < 0.01) .SelectMany(p => p.Identifications.Select(d => d.ModifiedSequence))); MbrScorer scorer = BuildMbrScorer(acceptorFileIdentifiedPeaks, out var mbrTol); @@ -748,24 +934,24 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } // this stores the results of MBR - var matchBetweenRunsIdentifiedPeaks = new Dictionary>>(); + ConcurrentDictionary>> matchBetweenRunsIdentifiedPeaks = new(); // map each donor file onto this file - foreach (SpectraFileInfo idDonorFile in _spectraFileInfo) + foreach (var donorFilePeakListKvp in DonorFileToPeakDict) { - if (idAcceptorFile.Equals(idDonorFile)) + if (idAcceptorFile.Equals(donorFilePeakListKvp.Key)) { continue; } // this is the list of peaks identified in the other file but not in this one ("ID donor peaks") - List idDonorPeaks = _results.Peaks[idDonorFile].Where(p => - !p.IsMbrPeak - && p.NumIdentificationsByFullSeq == 1 - && p.IsotopicEnvelopes.Any() - && PeptidesModifiedSequencesToQuantify.Contains(p.Identifications.First().ModifiedSequence) // Only do MBR for peptides that we want to quantify - && !acceptorFileIdentifiedSequences.Contains(p.Identifications.First().ModifiedSequence) - && (!RequireMsmsIdInCondition || p.Identifications.Any(v => v.ProteinGroups.Any(g => thisFilesMsmsIdentifiedProteins.Contains(g))))).ToList(); + List idDonorPeaks = donorFilePeakListKvp.Value + .Where(p => + !acceptorFileIdentifiedSequences.Contains(p.Identifications.First().ModifiedSequence) + && (!RequireMsmsIdInCondition + || p.Identifications.Any(v => v.ProteinGroups.Any(g => thisFilesMsmsIdentifiedProteins.Contains(g)))) + && this.PeptideModifiedSequencesToQuantify.Contains(p.Identifications.First().ModifiedSequence)) + .ToList(); if 
(!idDonorPeaks.Any()) { @@ -773,7 +959,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } bool donorSampleIsFractionated = _results.SpectraFiles - .Where(p => p.Condition == idDonorFile.Condition && p.BiologicalReplicate == idDonorFile.BiologicalReplicate) + .Where(p => p.Condition == donorFilePeakListKvp.Key.Condition && p.BiologicalReplicate == donorFilePeakListKvp.Key.BiologicalReplicate) .Select(p => p.Fraction) .Distinct() .Count() > 1; @@ -781,21 +967,22 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) // We're only interested in the fold change if the conditions are different. Otherwise, we score based off of the intensities // of the acceptor file if (_spectraFileInfo.Select(p => p.Condition).Distinct().Count() > 1 - && idDonorFile.Condition != idAcceptorFile.Condition) + && donorFilePeakListKvp.Key.Condition != idAcceptorFile.Condition) { scorer.CalculateFoldChangeBetweenFiles(idDonorPeaks); } // generate RT calibration curve - RetentionTimeCalibDataPoint[] rtCalibrationCurve = GetRtCalSpline(idDonorFile, idAcceptorFile, scorer); + RetentionTimeCalibDataPoint[] rtCalibrationCurve = GetRtCalSpline(donorFilePeakListKvp.Key, idAcceptorFile, scorer, out var donorPeaksMassOrdered); + + // break if MBR transfers can't be scored + if (!scorer.IsValid(donorFilePeakListKvp.Key)) continue; // Loop through every MSMS id in the donor file Parallel.ForEach(Partitioner.Create(0, idDonorPeaks.Count), new ParallelOptions { MaxDegreeOfParallelism = MaxThreads }, (range, loopState) => { - var matchBetweenRunsIdentifiedPeaksThreadSpecific = new Dictionary>>(); - for (int i = range.Item1; i < range.Item2; i++) { ChromatographicPeak donorPeak = idDonorPeaks[i]; @@ -803,49 +990,91 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) RtInfo rtInfo = PredictRetentionTime(rtCalibrationCurve, donorPeak, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated); if (rtInfo == null) 
continue; - FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, matchBetweenRunsIdentifiedPeaksThreadSpecific); - } + // Look for MBR target (predicted-RT peak) + FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, out var bestAcceptor); + AddPeakToConcurrentDict(matchBetweenRunsIdentifiedPeaks, bestAcceptor, donorPeak.Identifications.First()); + + //Draw a random donor that has an rt sufficiently far enough away + double minimumRtDifference = rtInfo.Width*2; + ChromatographicPeak randomDonor = GetRandomPeak(donorPeaksMassOrdered, + donorPeak.ApexRetentionTime, + minimumRtDifference, + donorPeak.Identifications.First()); + + // Look for MBR decoy (random-RT peak) + ChromatographicPeak bestDecoy = null; + RtInfo decoyRtInfo = null; + if (randomDonor != null) + { + decoyRtInfo = PredictRetentionTime(rtCalibrationCurve, randomDonor, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated); + if (decoyRtInfo != null) + { + //Find a decoy peak using the randomly drawn retention time + FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, out bestDecoy, + randomRt: decoyRtInfo.PredictedRt); + AddPeakToConcurrentDict(matchBetweenRunsIdentifiedPeaks, bestDecoy, donorPeak.Identifications.First()); + } + } - lock (matchBetweenRunsIdentifiedPeaks) - { - foreach (var kvp in matchBetweenRunsIdentifiedPeaksThreadSpecific) + double windowWidth = Math.Max(0.5, rtInfo.Width); + // If the search turned up empty, try again with a wider search window + while (bestAcceptor == null && bestDecoy == null) { - if (matchBetweenRunsIdentifiedPeaks.TryGetValue(kvp.Key, out var list)) + windowWidth = Math.Min(windowWidth, MbrRtWindow); + rtInfo.Width = windowWidth; + FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, out bestAcceptor); + AddPeakToConcurrentDict(matchBetweenRunsIdentifiedPeaks, bestAcceptor, donorPeak.Identifications.First()); + + if(decoyRtInfo != null) + { + decoyRtInfo.Width = 
windowWidth; + FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, out bestDecoy, + randomRt: decoyRtInfo.PredictedRt); + AddPeakToConcurrentDict(matchBetweenRunsIdentifiedPeaks, bestDecoy, donorPeak.Identifications.First()); + } + if (windowWidth >= MbrRtWindow) { - foreach (var peak in kvp.Value) - { - if (list.TryGetValue(peak.Key, out List existing)) - { - foreach (var acceptorPeak in peak.Value) - { - var samePeakSameSequence = existing - .FirstOrDefault(p => p.Identifications.First().ModifiedSequence == acceptorPeak.Identifications.First().ModifiedSequence); - - if (samePeakSameSequence != null) - { - samePeakSameSequence.Identifications.Add(acceptorPeak.Identifications.First()); - } - else - { - existing.Add(acceptorPeak); - } - } - } - else - { - list.Add(peak.Key, peak.Value); - } - } + break; } else { - matchBetweenRunsIdentifiedPeaks.Add(kvp.Key, kvp.Value); + windowWidth += 0.5; } } + } }); } + // Eliminate duplicate peaks (not sure where they come from) + foreach (var seqDictionaryKvp in matchBetweenRunsIdentifiedPeaks) + { + // Each isotopic envelope is linked to a list of ChromatographicPeaks + // Here, we remove instances where the same envelope is associated with multiple chromatographic peaks but the peaks correspond to the same donor peptide + // I don't know why this happens lol + // If multiple peaks are associated with the same envelope, and they have different associated peptide identifications, then they're kept separate. 
+ foreach (var envelopePeakListKvp in seqDictionaryKvp.Value) + { + List bestPeaks = new(); + foreach (var peakGroup in envelopePeakListKvp.Value.GroupBy(peak => peak.Identifications.First().ModifiedSequence)) + { + bestPeaks.Add(peakGroup.MaxBy(peak => peak.MbrScore)); + } + envelopePeakListKvp.Value.Clear(); + envelopePeakListKvp.Value.AddRange(bestPeaks); + } + } + + // Create a dictionary that stores imsPeak associated with an ms/ms identified peptide + Dictionary> msmsImsPeaks = _results.Peaks[idAcceptorFile] + .Where(peak => + !peak.DecoyPeptide + && peak.Apex?.IndexedPeak != null + && PeptideModifiedSequencesToQuantify.Contains(peak.Identifications.First().ModifiedSequence)) + .Select(peak => peak.Apex.IndexedPeak) + .GroupBy(imsPeak => imsPeak.ZeroBasedMs1ScanIndex) + .ToDictionary(g => g.Key, g => g.ToList()); + // take the best result (highest scoring) for each peptide after we've matched from all the donor files foreach (var mbrIdentifiedPeptide in matchBetweenRunsIdentifiedPeaks.Where(p => !acceptorFileIdentifiedSequences.Contains(p.Key))) { @@ -855,36 +1084,101 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) continue; } - List peakHypotheses = mbrIdentifiedPeptide.Value.SelectMany(p => p.Value).OrderByDescending(p => p.MbrScore).ToList(); - - ChromatographicPeak best = peakHypotheses.First(); - - peakHypotheses.Remove(best); - - if (peakHypotheses.Count > 0) + foreach (var peakHypothesisGroup in mbrIdentifiedPeptide.Value.SelectMany(kvp => kvp.Value).OrderByDescending(p => p.MbrScore).GroupBy(p => p.RandomRt)) { - double start = best.IsotopicEnvelopes.Min(p => p.IndexedPeak.RetentionTime); - double end = best.IsotopicEnvelopes.Max(p => p.IndexedPeak.RetentionTime); + var peakHypotheses = peakHypothesisGroup.ToList(); + ChromatographicPeak best = peakHypotheses.First(); + peakHypotheses.Remove(best); - List peaksToRemoveFromHypotheses = new List(); - foreach (ChromatographicPeak peak in peakHypotheses.Where(p => 
p.Apex.ChargeState != best.Apex.ChargeState)) + // Discard any peaks that are already associated with an ms/ms identified peptide + while (best?.Apex?.IndexedPeak != null && msmsImsPeaks.TryGetValue(best.Apex.IndexedPeak.ZeroBasedMs1ScanIndex, out var peakList)) { - if (peak.Apex.IndexedPeak.RetentionTime > start && peak.Apex.IndexedPeak.RetentionTime < end) + if (peakList.Contains(best.Apex.IndexedPeak)) + { + if (!peakHypotheses.Any()) + { + best = null; + break; + } + best = peakHypotheses.First(); + peakHypotheses.Remove(best); + } + else { - best.MergeFeatureWith(peak, Integrate); + break; + } + } + if (best == null) continue; - peaksToRemoveFromHypotheses.Add(peak); + // merge peaks with different charge states + if (peakHypotheses.Count > 0) + { + double start = best.IsotopicEnvelopes.Min(p => p.IndexedPeak.RetentionTime); + double end = best.IsotopicEnvelopes.Max(p => p.IndexedPeak.RetentionTime); + + _results.Peaks[idAcceptorFile].Add(best); + foreach (ChromatographicPeak peak in peakHypotheses.Where(p => p.Apex.ChargeState != best.Apex.ChargeState)) + { + if (peak.Apex.IndexedPeak.RetentionTime >= start + && peak.Apex.IndexedPeak.RetentionTime <= end) + //&& Math.Abs(peak.MbrScore - best.MbrScore) / best.MbrScore < 0.25)// 25% difference is a rough heuristic, but I don't want super shitty peaks being able to supercede the intensity of a good peak! + { + if (msmsImsPeaks.TryGetValue(peak.Apex.IndexedPeak.ZeroBasedMs1ScanIndex, out var peakList) && peakList.Contains(peak.Apex.IndexedPeak)) + { + continue; // If the peak is already accounted for, skip it. + } + else + { + best.MergeFeatureWith(peak, Integrate); + } + } } } + _results.Peaks[idAcceptorFile].Add(best); } - - _results.Peaks[idAcceptorFile].Add(best); } - RunErrorChecking(idAcceptorFile); } + /// + /// A concurrent dictionary is used to keep track of MBR peaks that have been identified in the acceptor file. This function updates that dictionary + /// + /// concurrent dictionary. 
Key = Peptide sequence. Value = ConcurrentDictionary mapping where keys are isotopic envelopes and values are list of associated peaks + /// Peak to add to the dictionary + /// The donor ID associated with the MBR peaks + private void AddPeakToConcurrentDict(ConcurrentDictionary>> matchBetweenRunsIdentifiedPeaks, + ChromatographicPeak peakToSave, + Identification donorIdentification) + { + if(peakToSave == null) + { + return; + } + // save the peak hypothesis + matchBetweenRunsIdentifiedPeaks.AddOrUpdate + ( + // new key + key: donorIdentification.ModifiedSequence, + // if we are adding a value for the first time, we simply create a new dictionary with one entry + addValueFactory: (sequenceKey) => + new ConcurrentDictionary>( + new Dictionary> + { + { peakToSave.Apex, new List { peakToSave } } + }), + // if the key (sequence) already exists, we have to add the new peak to the existing dictionary + updateValueFactory: (sequenceKey, envelopePeakListDict) => + { + envelopePeakListDict.AddOrUpdate( + key: peakToSave.Apex, + addValueFactory: (envelopeKey) => new List { peakToSave }, // if the key (envelope) doesn't exist, just create a new list + updateValueFactory: (envelopeKey, peakList) => { peakList.Add(peakToSave); return peakList; }); // if the key (envelope) already exists, add the peak to the associated list + return envelopePeakListDict; + } + ); + } + /// /// Finds MBR acceptor peaks by looping through every possible peak for every possible charge state /// in a given retention time range. Identified peaks are added to the matchBetweenRunsIdentifiedPeaks dictionary. @@ -900,21 +1194,24 @@ internal void FindAllAcceptorPeaks( RtInfo rtInfo, Tolerance fileSpecificTol, ChromatographicPeak donorPeak, - Dictionary>> matchBetweenRunsIdentifiedPeaksThreadSpecific) + out ChromatographicPeak bestAcceptor, + double? 
randomRt = null) { // get the MS1 scan info for this region so we can look up indexed peaks Ms1ScanInfo[] ms1ScanInfos = _ms1Scans[idAcceptorFile]; Ms1ScanInfo start = ms1ScanInfos[0]; Ms1ScanInfo end = ms1ScanInfos[ms1ScanInfos.Length - 1]; + double rtStartHypothesis = randomRt == null ? rtInfo.RtStartHypothesis : (double)randomRt - (rtInfo.Width / 2.0); + double rtEndHypothesis = randomRt == null ? rtInfo.RtEndHypothesis : (double)randomRt + (rtInfo.Width / 2.0); for (int j = 0; j < ms1ScanInfos.Length; j++) { Ms1ScanInfo scan = ms1ScanInfos[j]; - if (scan.RetentionTime <= rtInfo.RtStartHypothesis) + if (scan.RetentionTime <= rtStartHypothesis) { start = scan; } - if (scan.RetentionTime >= rtInfo.RtEndHypothesis) + if (scan.RetentionTime >= rtEndHypothesis) { end = scan; break; @@ -930,6 +1227,7 @@ internal void FindAllAcceptorPeaks( } Identification donorIdentification = donorPeak.Identifications.First(); + bestAcceptor = null; foreach (int z in chargesToMatch) { @@ -951,36 +1249,13 @@ internal void FindAllAcceptorPeaks( while (chargeEnvelopes.Any()) { ChromatographicPeak acceptorPeak = FindIndividualAcceptorPeak(idAcceptorFile, scorer, donorPeak, - fileSpecificTol, rtInfo, z, chargeEnvelopes); + fileSpecificTol, rtInfo, z, chargeEnvelopes, randomRt); if (acceptorPeak == null) - continue; - - // save the peak hypothesis - if (matchBetweenRunsIdentifiedPeaksThreadSpecific.TryGetValue(donorIdentification.ModifiedSequence, out var mbrPeaks)) - { - if (mbrPeaks.TryGetValue(acceptorPeak.Apex, out List existing)) - { - var samePeakSameSequence = existing - .FirstOrDefault(p => p.Identifications.First().ModifiedSequence == acceptorPeak.Identifications.First().ModifiedSequence); - - if (samePeakSameSequence != null) - { - samePeakSameSequence.Identifications.Add(donorIdentification); - } - else - { - existing.Add(acceptorPeak); - } - } - else - { - mbrPeaks.Add(acceptorPeak.Apex, new List { acceptorPeak }); - } - } - else + continue; + if (bestAcceptor == null || 
bestAcceptor.MbrScore < acceptorPeak.MbrScore) { - matchBetweenRunsIdentifiedPeaksThreadSpecific.Add(donorIdentification.ModifiedSequence, new Dictionary>()); - matchBetweenRunsIdentifiedPeaksThreadSpecific[donorIdentification.ModifiedSequence].Add(acceptorPeak.Apex, new List { acceptorPeak }); + acceptorPeak.ChargeList = chargesToMatch; + bestAcceptor = acceptorPeak; } } } @@ -1004,10 +1279,11 @@ internal ChromatographicPeak FindIndividualAcceptorPeak( Tolerance mbrTol, RtInfo rtInfo, int z, - List chargeEnvelopes) + List chargeEnvelopes, + double? randomRt = null) { - var donorId = donorPeak.Identifications.First(); - var acceptorPeak = new ChromatographicPeak(donorId, true, idAcceptorFile, predictedRetentionTime: rtInfo.PredictedRt); + var donorId = donorPeak.Identifications.OrderBy(p => p.QValue).First(); + var acceptorPeak = new ChromatographicPeak(donorId, true, idAcceptorFile, randomRt != null); // Grab the first scan/envelope from charge envelopes. This should be the most intense envelope in the list IsotopicEnvelope seedEnv = chargeEnvelopes.First(); @@ -1030,7 +1306,7 @@ internal ChromatographicPeak FindIndividualAcceptorPeak( return null; } - acceptorPeak.CalculateMbrScore(scorer, donorPeak); + acceptorPeak.MbrScore = scorer.ScoreMbr(acceptorPeak, donorPeak, randomRt ?? 
rtInfo.PredictedRt); return acceptorPeak; } @@ -1076,9 +1352,9 @@ private void RunErrorChecking(SpectraFileInfo spectraFile) { if (!tryPeak.IsMbrPeak && !storedPeak.IsMbrPeak) { - if (PeptidesModifiedSequencesToQuantify.Contains(tryPeak.Identifications.First().ModifiedSequence)) + if (PeptideModifiedSequencesToQuantify.Contains(tryPeak.Identifications.First().ModifiedSequence)) { - if (PeptidesModifiedSequencesToQuantify.Contains(storedPeak.Identifications.First().ModifiedSequence)) + if (PeptideModifiedSequencesToQuantify.Contains(storedPeak.Identifications.First().ModifiedSequence)) { storedPeak.MergeFeatureWith(tryPeak, Integrate); } @@ -1096,14 +1372,20 @@ private void RunErrorChecking(SpectraFileInfo spectraFile) } else if (tryPeak.IsMbrPeak && !storedPeak.IsMbrPeak) { - if(PeptidesModifiedSequencesToQuantify.Contains(storedPeak.Identifications.First().ModifiedSequence)) + // Default to MSMS peaks over MBR Peaks. + // Most of these have already been eliminated + // However, sometimes merging MBR peaks with different charge states reveals that + // The MBR peak conflicts with an MSMS peak + // Removing the peak when this happens is a conservative step. + // Sometimes the MSMS peak is a decoy, or has a peptides level Q-value < 0.01 (i.e., the modified sequence isn't in PeptideModifiedSequencesToQuantify). + // In this case, we keep the MBR peak. 
+ if (storedPeak.DecoyPeptide || !PeptideModifiedSequencesToQuantify.Contains(storedPeak.Identifications.First().ModifiedSequence)) { - continue; + errorCheckedPeaksGroupedByApex[tryPeak.Apex.IndexedPeak] = tryPeak; } else { - // If the stored peak id isn't in the list of peptides to quantify, overwrite it - errorCheckedPeaksGroupedByApex[tryPeak.Apex.IndexedPeak] = tryPeak; + continue; } } else if (tryPeak.IsMbrPeak && storedPeak.IsMbrPeak) @@ -1129,6 +1411,140 @@ private void RunErrorChecking(SpectraFileInfo spectraFile) _results.Peaks[spectraFile] = errorCheckedPeaks; } + private bool RunPEPAnalysis() + { + List mbrPeaks = _results.Peaks.SelectMany(kvp => kvp.Value) + .Where(peak => peak.IsMbrPeak) + .OrderByDescending(peak => peak.MbrScore) + .ToList(); + + if (!mbrPeaks.IsNotNullOrEmpty()) return false; + int decoyPeakTotal = mbrPeaks.Count(peak => peak.RandomRt); + + List tempPepQs = new(); + List tempQs = new(); + if (mbrPeaks.Count > 100 && decoyPeakTotal > 20) + { + PepAnalysisEngine pepAnalysisEngine = new PepAnalysisEngine(mbrPeaks, + outputFolder: Path.GetDirectoryName(_spectraFileInfo.First().FullFilePathWithExtension), + maxThreads: MaxThreads, + pepTrainingFraction: PepTrainingFraction); + var pepOutput = pepAnalysisEngine.ComputePEPValuesForAllPeaks(); + + _results.PepResultString = pepOutput; + + return true; + } + return false; + } + + /// + /// Calculates the FDR for each MBR-detected peak using decoy peaks and decoy peptides, + /// Then filters out all peaks below a given FDR threshold + /// + private void CalculateFdrForMbrPeaks(SpectraFileInfo acceptorFile, bool usePep) + { + List mbrPeaks; + if (usePep) + { + // Take only the top scoring acceptor for each donor (acceptor can be target or decoy!) 
+ // Maybe we're sorting twice when we don't have to but idk if order is preserved using group by + mbrPeaks = _results.Peaks[acceptorFile] + .Where(peak => peak.IsMbrPeak) + .GroupBy(peak => peak.Identifications.First()) + .Select(group => group.OrderBy(peak => peak.MbrPep).ThenByDescending(peak => peak.MbrScore).First()) + .OrderBy(peak => peak.MbrPep) + .ThenByDescending(peak => peak.MbrScore) + .ToList(); + + _results.Peaks[acceptorFile] = mbrPeaks.Concat(_results.Peaks[acceptorFile].Where(peak => !peak.IsMbrPeak)).ToList(); + } + else + { + // If PEP wasn't performed, things probably aren't calibrated very well, and so it's better + // To err on the safe side and not remove the decoys + mbrPeaks = _results.Peaks[acceptorFile] + .Where(peak => peak.IsMbrPeak) + .OrderByDescending(peak => peak.MbrScore) + .ToList(); + } + + if (!mbrPeaks.IsNotNullOrEmpty()) return; + + List tempQs = new(); + int totalPeaks = 0; + int decoyPeptides = 0; + int decoyPeaks = 0; + int doubleDecoys = 0; + for (int i = 0; i < mbrPeaks.Count; i++) + { + totalPeaks++; + switch (mbrPeaks[i]) + { + case ChromatographicPeak p when (!p.DecoyPeptide && !p.RandomRt): + break; + case ChromatographicPeak p when (p.DecoyPeptide && !p.RandomRt): + decoyPeptides++; + break; + case ChromatographicPeak p when (!p.DecoyPeptide && p.RandomRt): + decoyPeaks++; + break; + case ChromatographicPeak p when (p.DecoyPeptide && p.RandomRt): + doubleDecoys++; + break; + } + + // There are two parts to this score. We're summing the PEPs of peaks derived from target peptides. 
For peaks derived from decoy peptides, + // We do the double decoy things where we count decoyPeptidePeaks - doubleDecoypeaks + tempQs.Add(Math.Round(EstimateFdr(doubleDecoys, decoyPeptides, decoyPeaks, totalPeaks), 6)); + } + + // Set the q-value for each peak + double[] correctedQs = CorrectQValues(tempQs); + for (int i = 0; i < correctedQs.Length; i++) + { + mbrPeaks[i].MbrQValue = correctedQs[i]; + } + } + + private int EstimateDecoyPeptideErrors(int decoyPeptideCount, int doubleDecoyCount) + { + return Math.Max(0, decoyPeptideCount - doubleDecoyCount); + } + + private double EstimateFdr(int doubleDecoyCount, int decoyPeptideCount, int decoyPeakCount, int totalPeakCount) + { + return (double)(1 + decoyPeakCount + EstimateDecoyPeptideErrors(decoyPeptideCount, doubleDecoyCount)) / totalPeakCount; + } + + /// + /// Standard q-value correction, ensures that in a list of temporary q-values, a q-value is equal to + /// Min(q-values, every q-value below in the list). As you work your way down a list of q-values, the value should only increase or stay the same. + /// + /// + /// + private double[] CorrectQValues(List tempQs) + { + if (!tempQs.IsNotNullOrEmpty()) return null; + double[] correctedQValues = new double[tempQs.Count]; + correctedQValues[tempQs.Count - 1] = tempQs.Last(); + for(int i = tempQs.Count-2; i >=0; i--) + { + if (tempQs[i] > correctedQValues[i+1]) + { + correctedQValues[i] = correctedQValues[i + 1]; + } + else + { + correctedQValues[i] = tempQs[i]; + } + } + + return correctedQValues; + } + + #endregion + /// /// Takes in a list of imsPeaks and finds all the isotopic peaks in each scan. If the experimental isotopic distribution /// matches the theoretical distribution, an IsotopicEnvelope object is created from the summed intensities of each isotopic peak. 
@@ -1228,7 +1644,7 @@ public List GetIsotopicEnvelopes( } // Check that the experimental envelope matches the theoretical - if (CheckIsotopicEnvelopeCorrelation(massShiftToIsotopePeaks, peak, chargeState, isotopeTolerance)) + if (CheckIsotopicEnvelopeCorrelation(massShiftToIsotopePeaks, peak, chargeState, isotopeTolerance, out var pearsonCorr)) { // impute unobserved isotope peak intensities // TODO: Figure out why value imputation is performed. Build a toggle? @@ -1240,7 +1656,7 @@ public List GetIsotopicEnvelopes( } } - isotopicEnvelopes.Add(new IsotopicEnvelope(peak, chargeState, experimentalIsotopeIntensities.Sum())); + isotopicEnvelopes.Add(new IsotopicEnvelope(peak, chargeState, experimentalIsotopeIntensities.Sum(), pearsonCorr)); } } @@ -1261,9 +1677,10 @@ public bool CheckIsotopicEnvelopeCorrelation( Dictionary> massShiftToIsotopePeaks, IndexedMassSpectralPeak peak, int chargeState, - Tolerance isotopeTolerance) + Tolerance isotopeTolerance, + out double pearsonCorrelation) { - double pearsonCorrelation = Correlation.Pearson( + pearsonCorrelation = Correlation.Pearson( massShiftToIsotopePeaks[0].Select(p => p.expIntensity), massShiftToIsotopePeaks[0].Select(p => p.theorIntensity)); diff --git a/mzLib/FlashLFQ/Identification.cs b/mzLib/FlashLFQ/Identification.cs index 85c557c3e..2ee9bd1b4 100644 --- a/mzLib/FlashLFQ/Identification.cs +++ b/mzLib/FlashLFQ/Identification.cs @@ -15,11 +15,15 @@ public class Identification public readonly ChemicalFormula OptionalChemicalFormula; public readonly bool UseForProteinQuant; public double PeakfindingMass; + public double PsmScore { get; init; } + public double QValue { get; init; } + public bool IsDecoy { get; } public Identification(SpectraFileInfo fileInfo, string BaseSequence, string ModifiedSequence, double monoisotopicMass, double ms2RetentionTimeInMinutes, int chargeState, List proteinGroups, - ChemicalFormula optionalChemicalFormula = null, bool useForProteinQuant = true) + ChemicalFormula 
optionalChemicalFormula = null, bool useForProteinQuant = true, + double psmScore = 0, double qValue = 0, bool decoy = false) { this.FileInfo = fileInfo; this.BaseSequence = BaseSequence; @@ -29,7 +33,10 @@ public Identification(SpectraFileInfo fileInfo, string BaseSequence, string Modi this.PrecursorChargeState = chargeState; this.ProteinGroups = new HashSet(proteinGroups); this.OptionalChemicalFormula = optionalChemicalFormula; - UseForProteinQuant = useForProteinQuant; + UseForProteinQuant = !decoy && useForProteinQuant; // ensure that decoy peptides aren't used for protein quant + QValue = qValue; + PsmScore = psmScore; + IsDecoy = decoy; } public override string ToString() diff --git a/mzLib/FlashLFQ/IndexedMassSpectralPeak.cs b/mzLib/FlashLFQ/IndexedMassSpectralPeak.cs index c9aa89042..cdadc56eb 100644 --- a/mzLib/FlashLFQ/IndexedMassSpectralPeak.cs +++ b/mzLib/FlashLFQ/IndexedMassSpectralPeak.cs @@ -29,7 +29,7 @@ public override bool Equals(object obj) public override int GetHashCode() { - return Mz.GetHashCode(); + return HashCode.Combine(Mz, ZeroBasedMs1ScanIndex); } public override string ToString() diff --git a/mzLib/FlashLFQ/IsotopicEnvelope.cs b/mzLib/FlashLFQ/IsotopicEnvelope.cs index 09d7207d7..938ac7850 100644 --- a/mzLib/FlashLFQ/IsotopicEnvelope.cs +++ b/mzLib/FlashLFQ/IsotopicEnvelope.cs @@ -11,11 +11,12 @@ public class IsotopicEnvelope public readonly IndexedMassSpectralPeak IndexedPeak; public readonly int ChargeState; - public IsotopicEnvelope(IndexedMassSpectralPeak monoisotopicPeak, int chargeState, double intensity) + public IsotopicEnvelope(IndexedMassSpectralPeak monoisotopicPeak, int chargeState, double intensity, double pearsonCorrelation) { IndexedPeak = monoisotopicPeak; ChargeState = chargeState; Intensity = intensity / chargeState; + PearsonCorrelation = pearsonCorrelation; } /// @@ -25,6 +26,9 @@ public IsotopicEnvelope(IndexedMassSpectralPeak monoisotopicPeak, int chargeStat /// public double Intensity { get; private set; } + + 
public double PearsonCorrelation { get; init; } + public void Normalize(double normalizationFactor) { Intensity *= normalizationFactor; diff --git a/mzLib/FlashLFQ/MbrScorer.cs b/mzLib/FlashLFQ/MbrScorer.cs index a703cf3b7..72c2ee72d 100644 --- a/mzLib/FlashLFQ/MbrScorer.cs +++ b/mzLib/FlashLFQ/MbrScorer.cs @@ -1,8 +1,10 @@ -using MathNet.Numerics.Distributions; +using Easy.Common.EasyComparer; +using MathNet.Numerics.Distributions; using MathNet.Numerics.Statistics; using System; using System.Collections.Generic; using System.Data; +using System.Data.Entity.ModelConfiguration.Conventions; using System.Linq; namespace FlashLFQ @@ -13,16 +15,18 @@ namespace FlashLFQ /// internal class MbrScorer { - internal SpectraFileInfo AcceptorFile { get; init; } // Intensity and ppm distributions are specific to each acceptor file private readonly Normal _logIntensityDistribution; private readonly Normal _ppmDistribution; private readonly Normal _scanCountDistribution; + private readonly Gamma _isotopicCorrelationDistribution; // The logFcDistributions and rtDifference distributions are unique to each donor file - acceptor file pair private Dictionary _logFcDistributionDictionary; private Dictionary _rtPredictionErrorDistributionDictionary; + internal Dictionary ApexToAcceptorFilePeakDict { get; } - internal List UnambiguousMsMsPeaks { get; } + internal List UnambiguousMsMsAcceptorPeaks { get; } + internal double MaxNumberOfScansObserved { get; } /// /// Takes in an intensity distribution, a log foldchange distribution, and a ppm distribution @@ -30,32 +34,47 @@ internal class MbrScorer /// internal MbrScorer( Dictionary apexToAcceptorFilePeakDict, - List acceptorPeaks, + List acceptorFileMsmsPeaks, Normal ppmDistribution, Normal logIntensityDistribution) { - AcceptorFile = acceptorPeaks.First().SpectraFileInfo; ApexToAcceptorFilePeakDict = apexToAcceptorFilePeakDict; - UnambiguousMsMsPeaks = acceptorPeaks.Where(p => p.Apex != null && !p.IsMbrPeak && 
p.NumIdentificationsByFullSeq == 1).ToList(); + UnambiguousMsMsAcceptorPeaks = acceptorFileMsmsPeaks.Where(p => p.Apex != null && !p.IsMbrPeak && p.NumIdentificationsByFullSeq == 1).ToList(); + MaxNumberOfScansObserved = acceptorFileMsmsPeaks.Max(peak => peak.ScanCount); _logIntensityDistribution = logIntensityDistribution; _ppmDistribution = ppmDistribution; + _isotopicCorrelationDistribution = GetIsotopicEnvelopeCorrDistribution(); _logFcDistributionDictionary = new(); _rtPredictionErrorDistributionDictionary = new(); // This is kludgey, because scan counts are discrete - List scanList = acceptorPeaks.Select(peak => (double)peak.ScanCount).ToList(); + List scanList = UnambiguousMsMsAcceptorPeaks.Select(peak => (double)peak.ScanCount).ToList(); // build a normal distribution for the scan list of the acceptor peaks - // InterQuartileRange / 1.35 = StandardDeviation for a normal distribution _scanCountDistribution = new Normal(scanList.Average(), scanList.Count > 30 ? scanList.StandardDeviation() : scanList.InterquartileRange() / 1.36); } + /// + /// This distribution represents (1 - Pearson Correlation) for isotopic envelopes of MS/MS acceptor peaks + /// + /// + private Gamma GetIsotopicEnvelopeCorrDistribution() + { + var pearsonCorrs = UnambiguousMsMsAcceptorPeaks.Select(p => 1 - p.IsotopicPearsonCorrelation).Where(p => p > 0).ToList(); + if (pearsonCorrs.Count <= 1) return null; + double mean = pearsonCorrs.Mean(); + double variance = pearsonCorrs.Variance(); + var alpha = Math.Pow(mean, 2) / variance; + var beta = mean / variance; + return new Gamma(alpha, beta); + } + /// /// Takes in a list of retention time differences for anchor peptides (donor RT - acceptor RT) and uses /// this list to calculate the distribution of prediction errors of the local RT alignment strategy employed by /// match-between-runs for the specified donor file /// /// List of retention time differences (doubles) calculated as donor file RT - acceptor file RT - internal void 
AddRtPredErrorDistribution(SpectraFileInfo donorFile, List anchorPeptideRtDiffs, int numberOfAnchorPeptidesPerSide) + internal void AddRtPredErrorDistribution(SpectraFileInfo donorFile, List anchorPeptideRtDiffs, int numberOfAnchorPeptides) { // in MBR, we use anchor peptides on either side of the donor to predict the retention time // here, we're going to repeat the same process, using neighboring anchor peptides to predicte the Rt shift for each @@ -66,40 +85,88 @@ internal void AddRtPredErrorDistribution(SpectraFileInfo donorFile, List double cumSumRtDiffs; List rtPredictionErrors = new(); - for (int i = numberOfAnchorPeptidesPerSide; i < anchorPeptideRtDiffs.Count - (numberOfAnchorPeptidesPerSide) ; i++) + for (int i = numberOfAnchorPeptides; i < (anchorPeptideRtDiffs.Count - numberOfAnchorPeptides); i++) { cumSumRtDiffs = 0; - for(int j = 1; j <= numberOfAnchorPeptidesPerSide; j++) + for(int j = 1; j <= numberOfAnchorPeptides; j++) { cumSumRtDiffs += anchorPeptideRtDiffs[i - j]; cumSumRtDiffs += anchorPeptideRtDiffs[i + j]; } - double avgDiff = cumSumRtDiffs / (2 * numberOfAnchorPeptidesPerSide); + double avgDiff = cumSumRtDiffs / (2 * numberOfAnchorPeptides); rtPredictionErrors.Add(avgDiff - anchorPeptideRtDiffs[i]); } - if(!rtPredictionErrors.Any()) + Normal rtPredictionErrorDist = new Normal(0, 0); + // Default distribution. 
Effectively assigns a RT Score of zero if no alignment can be performed + // between the donor and acceptor based on shared MS/MS IDs + + if(rtPredictionErrors.Any()) { - _rtPredictionErrorDistributionDictionary.Add(donorFile, new Normal(0, 1)); - return; + double medianRtError = rtPredictionErrors.Median(); + double stdDevRtError = rtPredictionErrors.StandardDeviation(); + if(stdDevRtError >= 0.0 && !double.IsNaN(medianRtError)) + { + rtPredictionErrorDist = new Normal(medianRtError, 1); + } } + + _rtPredictionErrorDistributionDictionary.Add(donorFile, rtPredictionErrorDist); + } - double medianRtError = rtPredictionErrors.Median(); - double stdDevRtError = rtPredictionErrors.StandardDeviation(); - - _rtPredictionErrorDistributionDictionary.Add(donorFile, new Normal(medianRtError, stdDevRtError)); + /// + /// Takes in a list of retention time differences for anchor peptides (donor RT - acceptor RT) and uses + /// this list to calculate the distribution of prediction errors of the local RT alignment strategy employed by + /// match-between-runs for the specified donor file + /// + /// An MBR Score ranging between 0 and 100. Higher scores are better. 
+ internal double ScoreMbr(ChromatographicPeak acceptorPeak, ChromatographicPeak donorPeak, double predictedRt) + { + acceptorPeak.IntensityScore = CalculateIntensityScore(acceptorPeak.Intensity, donorPeak); + acceptorPeak.RtPredictionError = predictedRt - acceptorPeak.ApexRetentionTime; + acceptorPeak.RtScore = CalculateScore(_rtPredictionErrorDistributionDictionary[donorPeak.SpectraFileInfo], + acceptorPeak.RtPredictionError); + acceptorPeak.PpmScore = CalculateScore(_ppmDistribution, acceptorPeak.MassError); + acceptorPeak.ScanCountScore = CalculateScore(_scanCountDistribution, acceptorPeak.ScanCount); + acceptorPeak.IsotopicDistributionScore = CalculateScore(_isotopicCorrelationDistribution, 1 - acceptorPeak.IsotopicPearsonCorrelation); + + // Returns 100 times the geometric mean of the four scores (scan count, intensity score, rt score, ppm score) + return 100 * Math.Pow(acceptorPeak.IntensityScore + * acceptorPeak.RtScore + * acceptorPeak.PpmScore + * acceptorPeak.ScanCountScore + * acceptorPeak.IsotopicDistributionScore, 0.20); } - private double CalculateScore(Normal distribution, double value) + // Setting a minimum score prevents the MBR score from going to zero if one component of that score is 0 + // 3e-7 is the fraction of a normal distribution that lies at least 5 stdDev away from the mean + private double _minScore = 3e-7; + + internal double CalculateScore(Normal distribution, double value) { + // new method double absoluteDiffFromMean = Math.Abs(distribution.Mean - value); // Returns a value between (0, 1] where 1 means the value was equal to the distribution mean - return 2 * distribution.CumulativeDistribution(distribution.Mean - absoluteDiffFromMean); + // The score represents the fraction of the distribution that lies absoluteDiffFromMean away from the mean or further + // i.e., what fraction of the distribution is more extreme than value + double score = 2 * distribution.CumulativeDistribution(distribution.Mean - absoluteDiffFromMean); + 
return (double.IsNaN(score) || score == 0) ? _minScore : score; } - internal double CalculateIntensityScore(ChromatographicPeak acceptorPeak, ChromatographicPeak donorPeak) + internal double CalculateScore(Gamma distribution, double value) + { + if (value < 0 || distribution == null) + { + return _minScore; + } + + // For the gamma distribtuion, the CDF is 0 when the pearson correlation is equal to 1 (value = 0) + // The CDF then rapidly rises, reaching ~1 at a value of 0.3 (corresponding to a pearson correlation of 0.7) + return 1 - distribution.CumulativeDistribution(value); + } + + internal double CalculateIntensityScore(double acceptorIntensity, ChromatographicPeak donorPeak) { - double acceptorIntensity = acceptorPeak.Intensity; if (donorPeak != null && acceptorIntensity != 0 && donorPeak.Intensity != 0 && _logFcDistributionDictionary.TryGetValue(donorPeak.SpectraFileInfo, out var logFcDistribution)) { @@ -111,38 +178,7 @@ internal double CalculateIntensityScore(ChromatographicPeak acceptorPeak, Chroma var logIntensity = Math.Log(acceptorIntensity, 2); return CalculateScore(_logIntensityDistribution, logIntensity); } - } - /// - /// Calculates the retention time score for a given MbrAcceptor by comparing to the - /// distribution of all retention time prediction errors for all anchor peptides shared between - /// the donor and acceptor files - /// - /// Score bounded by 0 and 1, where higher scores are better - internal double CalculateRetentionTimeScore(ChromatographicPeak acceptorPeak, ChromatographicPeak donorPeak) - { - double rtPredictionError = acceptorPeak.PredictedRetentionTime - acceptorPeak.ApexRetentionTime; - return CalculateScore(_rtPredictionErrorDistributionDictionary[donorPeak.SpectraFileInfo], rtPredictionError); - } - - /// - /// Calculates the Ppm error score for a given acceptor by comparing the ppm error for the given peak - /// to the ppm error of all non-MBR peaks in the acceptor file - /// - /// Score bounded by 0 and 1, where higher 
scores are better - internal double CalculatePpmErrorScore(ChromatographicPeak acceptorPeak) - { - return CalculateScore(_ppmDistribution, acceptorPeak.MassError); - } - - /// - /// Calculates the scan count score for a given acceptor by comparing the number of scans observed for the given peak - /// to the ppm error of all non-MBR peaks in the acceptor file - /// - /// Score bounded by 0 and 1, where higher scores are better - internal double CalculateScanCountScore(ChromatographicPeak acceptorPeak) - { - return CalculateScore(_scanCountDistribution, acceptorPeak.ScanCount); } /// @@ -162,7 +198,7 @@ internal void CalculateFoldChangeBetweenFiles(List idDonorP var acceptorFileBestMsmsPeaks = new Dictionary(); // get the best (most intense) peak for each peptide in the acceptor file - foreach (ChromatographicPeak acceptorPeak in UnambiguousMsMsPeaks) + foreach (ChromatographicPeak acceptorPeak in UnambiguousMsMsAcceptorPeaks) { if (acceptorFileBestMsmsPeaks.TryGetValue(acceptorPeak.Identifications.First().ModifiedSequence, out ChromatographicPeak currentBestPeak)) { @@ -199,5 +235,18 @@ internal void CalculateFoldChangeBetweenFiles(List idDonorP } } + /// + /// Determines whether or not the scorer is validly paramaterized and capable + /// of scoring MBR transfers originating from the given donorFile + /// + internal bool IsValid(SpectraFileInfo donorFile) + { + return _rtPredictionErrorDistributionDictionary.TryGetValue(donorFile, out var rtDist) + && rtDist != null + && _ppmDistribution != null + && _scanCountDistribution != null + && _logIntensityDistribution != null; + } + } } diff --git a/mzLib/FlashLFQ/PEP/ChromatographicPeakData.cs b/mzLib/FlashLFQ/PEP/ChromatographicPeakData.cs new file mode 100644 index 000000000..fbb8f429d --- /dev/null +++ b/mzLib/FlashLFQ/PEP/ChromatographicPeakData.cs @@ -0,0 +1,106 @@ +using Easy.Common.Extensions; +using Microsoft.ML.Data; +using System.Collections.Generic; +using System.Collections.Immutable; +using System.Text; + 
+namespace FlashLFQ.PEP +{ + public class ChromatographicPeakData + { + public static readonly IImmutableDictionary trainingInfos = new Dictionary + { + { "standard", new [] + { + "PpmErrorScore", + "IntensityScore", + "RtScore", + "ScanCountScore", + "IsotopicDistributionScore", + "PpmErrorRaw", + "IntensityRaw", + "RtPredictionErrorRaw", + "ScanCountRaw", + "IsotopicPearsonCorrelation" + } + }, + { "reduced", new [] + { + "PpmErrorRaw", + "IntensityRaw", + "RtPredictionErrorRaw", + "ScanCountRaw", + "IsotopicPearsonCorrelation" + } + }, + }.ToImmutableDictionary(); + + /// + /// These are used for percolator. Trainer must be told the assumed direction for each attribute as it relates to being a true positive + /// Here, a weight of 1 indicates that the probability of being true is for higher numbers in the set. + /// A weight of -1 indicates that the probability of being true is for the lower numbers in the set. + /// + public static readonly IImmutableDictionary assumedAttributeDirection = new Dictionary { + { "PpmErrorScore", 1 }, + { "IntensityScore", 1 }, + { "RtScore", 1 }, + { "ScanCountScore", 1 }, + { "IsotopicDistributionScore", 1 }, + { "PpmErrorRaw", -1 }, + { "IntensityRaw", 1 }, + { "RtPredictionErrorRaw", -1 }, + { "ScanCountRaw", -1 }, + { "IsotopicPearsonCorrelation", 1 } + }.ToImmutableDictionary(); + + public string ToString(string searchType) + { + StringBuilder sb = new StringBuilder(); + var variablesToOutput = ChromatographicPeakData.trainingInfos[searchType]; + + foreach (var variable in variablesToOutput) + { + var property = typeof(ChromatographicPeakData).GetProperty(variable).GetValue(this, null); + var floatValue = (float)property; + sb.Append("\t"); + sb.Append(floatValue.ToString()); + } + + return sb.ToString(); + } + + [LoadColumn(0)] + public float PpmErrorScore { get; set; } + + [LoadColumn(1)] + public float IntensityScore { get; set; } + + [LoadColumn(2)] + public float RtScore { get; set; } + + [LoadColumn(3)] + public float 
ScanCountScore { get; set; } + + [LoadColumn(4)] + public float IsotopicDistributionScore { get; set; } + + [LoadColumn(5)] + public float PpmErrorRaw { get; set; } + + [LoadColumn(6)] + public float IntensityRaw { get; set; } + + [LoadColumn(7)] + public float RtPredictionErrorRaw { get; set; } + + [LoadColumn(8)] + public float ScanCountRaw { get; set; } + + [LoadColumn(9)] + public float IsotopicPearsonCorrelation { get; set; } + + [LoadColumn(10)] + public bool Label { get; set; } + + } +} \ No newline at end of file diff --git a/mzLib/FlashLFQ/PEP/DonorGroup.cs b/mzLib/FlashLFQ/PEP/DonorGroup.cs new file mode 100644 index 000000000..351bdee90 --- /dev/null +++ b/mzLib/FlashLFQ/PEP/DonorGroup.cs @@ -0,0 +1,42 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace FlashLFQ.PEP +{ + /// + /// This class represents a group of chromatographic peaks that are associated with a donor identification. + /// During MBR, one donor identification is associated with multiple acceptor identifications, with both + /// predicted retention times (good MBR transfers) and random retention times (decoy MBR transfers). + /// This class groups them together for the purpose of cross-validation/PEP scoring + /// + public class DonorGroup : IEnumerable + { + public Identification DonorId { get; } + public List TargetAcceptors { get; } + public List DecoyAcceptors { get; } + + public DonorGroup(Identification donorId, List targetAcceptors, List decoyAcceptors) + { + DonorId = donorId; + TargetAcceptors = targetAcceptors; + DecoyAcceptors = decoyAcceptors; + } + + public double BestTargetMbrScore => TargetAcceptors.Count == 0 ? 
0 : TargetAcceptors.Max(acceptor => acceptor.MbrScore); + + public IEnumerator GetEnumerator() + { + return TargetAcceptors.Concat(DecoyAcceptors).GetEnumerator(); + } + + IEnumerator IEnumerable.GetEnumerator() + { + return GetEnumerator(); + } + + } +} diff --git a/mzLib/FlashLFQ/PEP/PepAnalysisEngine.cs b/mzLib/FlashLFQ/PEP/PepAnalysisEngine.cs new file mode 100644 index 000000000..eaddae3e8 --- /dev/null +++ b/mzLib/FlashLFQ/PEP/PepAnalysisEngine.cs @@ -0,0 +1,647 @@ +using Microsoft.ML; +using Microsoft.ML.Data; +using Microsoft.ML.Trainers.FastTree; +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Omics; +using System.Collections; +using System.Security.Policy; +using System.Text.RegularExpressions; +using System.Reflection; + +namespace FlashLFQ.PEP +{ + public class PepAnalysisEngine + { + public double PipScoreCutoff; + + private static int _randomSeed = 42; + + /// + /// This method contains the hyper-parameters that will be used when training the machine learning model + /// + /// Options object to be passed in to the FastTree constructor + public FastTreeBinaryTrainer.Options BGDTreeOptions => + new FastTreeBinaryTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 100, + MinimumExampleCountPerLeaf = 10, + NumberOfLeaves = 20, + LearningRate = 0.2, + LabelColumnName = "Label", + FeatureColumnName = "Features", + Seed = _randomSeed, + FeatureSelectionSeed = _randomSeed, + RandomStart = false, + UnbalancedSets = true + }; + + public List Peaks { get; } + public string OutputFolder { get; set; } + public int MaxThreads { get; set; } + public double PepTrainingFraction { get; set; } + + public PepAnalysisEngine(List peaks, string outputFolder, int maxThreads, double pepTrainingFraction = 0.25) + { + Peaks = peaks; + OutputFolder = outputFolder; + MaxThreads = maxThreads; + PepTrainingFraction = pepTrainingFraction; + 
} + + public string ComputePEPValuesForAllPeaks() + { + string[] trainingVariables = ChromatographicPeakData.trainingInfos["standard"]; + + #region Construct Donor Groups + // this is target peak not target peptide + List donors= new(); + foreach(var donorGroup in Peaks + .Where(peak => peak.IsMbrPeak) + .OrderByDescending(peak => peak.MbrScore) + .GroupBy(peak => peak.Identifications.First())) //Group by donor peptide + { + var donorId = donorGroup.Key; + var targetAcceptors = donorGroup.Where(peak => !peak.RandomRt).ToList(); + var decoyAcceptors = donorGroup.Where(peak => peak.RandomRt).ToList(); + donors.Add(new DonorGroup(donorId, targetAcceptors, decoyAcceptors)); + } + + // Fix the order + donors = OrderDonorGroups(donors); + + var peakScores = donors.SelectMany(donor => donor.Select(p => p.MbrScore)).OrderByDescending(score => score).ToList(); + PipScoreCutoff = peakScores[(int)Math.Floor(peakScores.Count * PepTrainingFraction)]; //Select the top N percent of all peaks, only use those as positive examples + + MLContext mlContext = new MLContext(_randomSeed); + //the number of groups used for cross-validation is hard-coded at three. Do not change this number without changing other areas of effected code. + const int numGroups = 3; + + List[] donorGroupIndices = GetDonorGroupIndices(donors, numGroups, PipScoreCutoff); + + #endregion + + #region Create Groups and Model + IEnumerable[] ChromatographicPeakDataGroups = new IEnumerable[numGroups]; + for (int i = 0; i < numGroups; i++) + { + ChromatographicPeakDataGroups[i] = CreateChromatographicPeakData(donors, donorGroupIndices[i], MaxThreads); + + if (!ChromatographicPeakDataGroups[i].Any(p => p.Label == true) + || !ChromatographicPeakDataGroups[i].Any(p => p.Label == false)) + { + return "Posterior error probability analysis failed. 
This can occur for small data sets when some sample groups are missing positive or negative training examples."; + } + } + + TransformerChain>>[] trainedModels + = new TransformerChain>>[numGroups]; + + var trainer = mlContext.BinaryClassification.Trainers.FastTree(BGDTreeOptions); + var pipeline = mlContext.Transforms.Concatenate("Features", trainingVariables) + .Append(trainer); + + List allMetrics = new List(); + + #endregion + + #region Training and Cross Validation First iteration + + for (int groupIndexNumber = 0; groupIndexNumber < numGroups; groupIndexNumber++) + { + + List allGroupIndexes = Enumerable.Range(0, numGroups).ToList(); + allGroupIndexes.RemoveAt(groupIndexNumber); + + //concat doesn't work in a loop, therefore I had to hard code the concat to group 3 out of 4 lists. if the const int numGroups value is changed, then the concat has to be changed accordingly. + IDataView dataView = mlContext.Data.LoadFromEnumerable( + ChromatographicPeakDataGroups[allGroupIndexes[0]] + .Concat(ChromatographicPeakDataGroups[allGroupIndexes[1]])); + + trainedModels[groupIndexNumber] = pipeline.Fit(dataView); + var myPredictions = trainedModels[groupIndexNumber].Transform(mlContext.Data.LoadFromEnumerable(ChromatographicPeakDataGroups[groupIndexNumber])); + CalibratedBinaryClassificationMetrics metrics = mlContext.BinaryClassification.Evaluate(data: myPredictions, labelColumnName: "Label", scoreColumnName: "Score"); + + //Parallel operation of the following code requires the method to be stored and then read, once for each thread + //if not output directory is specified, the model cannot be stored, and we must force single-threaded operation + if (OutputFolder != null) + { + mlContext.Model.Save(trainedModels[groupIndexNumber], dataView.Schema, Path.Combine(OutputFolder, "model.zip")); + } + + Compute_PEP_For_All_Peaks(donors, donorGroupIndices[groupIndexNumber], mlContext, trainedModels[groupIndexNumber], OutputFolder, MaxThreads); + + allMetrics.Add(metrics); + } + 
+ #endregion + #region Iterative Training + + for(int trainingIteration = 0; trainingIteration < 9; trainingIteration++) + { + ChromatographicPeakDataGroups = new IEnumerable[numGroups]; + for (int i = 0; i < numGroups; i++) + { + ChromatographicPeakDataGroups[i] = CreateChromatographicPeakDataIteration(donors, donorGroupIndices[i], MaxThreads); + + if (!ChromatographicPeakDataGroups[i].Any(p => p.Label == true) + || !ChromatographicPeakDataGroups[i].Any(p => p.Label == false)) + { + return "Posterior error probability analysis failed. This can occur for small data sets when some sample groups are missing positive or negative training examples."; + } + } + + for (int groupIndexNumber = 0; groupIndexNumber < numGroups; groupIndexNumber++) + { + List allGroupIndexes = Enumerable.Range(0, numGroups).ToList(); + allGroupIndexes.RemoveAt(groupIndexNumber); + + IDataView dataView = mlContext.Data.LoadFromEnumerable( + ChromatographicPeakDataGroups[allGroupIndexes[0]] + .Concat(ChromatographicPeakDataGroups[allGroupIndexes[1]])); + + trainedModels[groupIndexNumber] = pipeline.Fit(dataView); + var myPredictions = trainedModels[groupIndexNumber].Transform(mlContext.Data.LoadFromEnumerable(ChromatographicPeakDataGroups[groupIndexNumber])); + CalibratedBinaryClassificationMetrics metrics = mlContext.BinaryClassification.Evaluate(data: myPredictions, labelColumnName: "Label", scoreColumnName: "Score"); + + //Parallel operation of the following code requires the method to be stored and then read, once for each thread + //if not output directory is specified, the model cannot be stored, and we must force single-threaded operation + if (OutputFolder != null) + { + mlContext.Model.Save(trainedModels[groupIndexNumber], dataView.Schema, Path.Combine(OutputFolder, "model.zip")); + } + + Compute_PEP_For_All_Peaks(donors, donorGroupIndices[groupIndexNumber], mlContext, trainedModels[groupIndexNumber], OutputFolder, MaxThreads); + + allMetrics.Add(metrics); + } + } + #endregion + + 
return AggregateMetricsForOutput(allMetrics); + } + + public static List OrderDonorGroups(List donors) + { + return donors.OrderByDescending(donor => donor.TargetAcceptors.Count) + .ThenByDescending(donor => donor.DecoyAcceptors.Count) + .ThenByDescending(donor => donor.BestTargetMbrScore) + .ToList(); + } + + //we add the indexes of the targets and decoys to the groups separately in the hope that we'll get at least one target and one decoy in each group. + //then training can possibly be more successful. + public static List[] GetDonorGroupIndices(List donors, int numGroups, double scoreCutoff) + { + List[] groupsOfIndices = new List[numGroups]; + for (int i = 0; i < numGroups; i++) + { + groupsOfIndices[i] = new List(); + } + + int myIndex = 0; + + while (myIndex < donors.Count) + { + int subIndex = 0; + while (subIndex < numGroups && myIndex < donors.Count) + { + groupsOfIndices[subIndex].Add(myIndex); + + subIndex++; + myIndex++; + } + } + + EqualizeDonorGroupIndices(donors, groupsOfIndices, scoreCutoff, numGroups); + + return groupsOfIndices; + } + + /// + /// Equalizes partitions used for cross-validation. The goal is to have the same number of targets and decoys in each partition + /// + /// List of all DonorGroups to be classified + /// An array of lists. 
Each list contains the indices of donor groups for a given partition + /// The MBR Score cutoff that determines which MBR target peaks will be used as positive training examples + /// /// Number of groups used for cross-validation, default = 3 + public static void EqualizeDonorGroupIndices(List donors, List[] groupsOfIndices, double scoreCutoff, int numGroups = 3) + { + HashSet swappedDonors = new HashSet(); // Keep track of everything we've swapped so we don't swap it again + // Outer loop iterates over the groups of indices (partitions) three times + // after each inner loop iterations, the number of ttargtes and decoys in each adjacent group is equal, but commonly group 1 and 3 will have a different number + // of targets and decoys. Looping three times should resolve this + for (int i = 0; i < numGroups*3 - 1; i++) + { + int groupA = i % numGroups; + int groupB = (i + 1) % numGroups; + int targetsA = 0; + int targetsB = 0; + int decoysA = 0; + int decoysB = 0; + foreach (int index in groupsOfIndices[groupA]) + { + targetsA += donors[index].TargetAcceptors.Count(peak => peak.MbrScore >= scoreCutoff); + decoysA += donors[index].DecoyAcceptors.Count; + } + foreach (int index in groupsOfIndices[groupB]) + { + targetsB += donors[index].TargetAcceptors.Count(peak => peak.MbrScore >= scoreCutoff); + decoysB += donors[index].DecoyAcceptors.Count; + } + + bool stuck = false; + int outerIterations = 0; + int minIndex = groupsOfIndices[groupA].Min(); + + // Calculate the difference in targets and decoys between the two groups + int targetSurplus = targetsA - targetsB; + int decoySurplus = decoysA - decoysB; + + while ((Math.Abs(targetSurplus) > 1 | Math.Abs(decoySurplus) > 1) && !stuck && outerIterations < 3) + { + bool swapped = false; + outerIterations++; + + int innerIterations = 0; + // start from the bottom of group 1, trying to swap peaks. 
+ // If group 1 has more targets than group 2, we want to swap groups to equalize the number of targets in each group + while (Math.Abs(targetSurplus) > 1 & !stuck & innerIterations < 3) + { + innerIterations++; + swapped = false; + // Traverse the list of donor indices in descending order, looking for a good candidate to swap + foreach (int donorIndexA in groupsOfIndices[groupA].Where(idx => !swappedDonors.Contains(idx)).OrderByDescending(idx => idx)) + { + int donorIndexATargetCount = donors[donorIndexA].TargetAcceptors.Count(peak => peak.MbrScore > scoreCutoff); + switch (targetSurplus > 0) + { + case true: // i.e., too many targets + if (donorIndexATargetCount < 1) continue; // No targets to swap + foreach (int donorIndexB in groupsOfIndices[groupB].Where(idx => !swappedDonors.Contains(idx)).OrderByDescending(idx => idx)) + { + if (donors[donorIndexB].TargetAcceptors.Count(peak => peak.MbrScore > scoreCutoff) < donorIndexATargetCount) + { + GroupSwap(donors, groupsOfIndices, donorIndexA, donorIndexB, groupA, groupB, + scoreCutoff, swappedDonors, ref targetSurplus, ref decoySurplus); + swapped = true; + break; + } + } + break; + case false: // i.e., too few targets + foreach (int donorIndexB in groupsOfIndices[groupB].Where(idx => !swappedDonors.Contains(idx)).OrderByDescending(idx => idx)) + { + if (donors[donorIndexB].TargetAcceptors.Count(peak => peak.MbrScore > scoreCutoff) > donorIndexATargetCount) + { + GroupSwap(donors, groupsOfIndices, donorIndexA, donorIndexB, groupA, groupB, + scoreCutoff, swappedDonors, ref targetSurplus, ref decoySurplus); + swapped = true; + break; + } + } + break; + } + + // If we reach the index of the list of donorGroups, set stuck to true so that the outer loop will break + if (donorIndexA == minIndex) + { + stuck = true; + break; + } + if (swapped) + break; + + } + } + + innerIterations = 0; + // Now we'll do the decoys + while (Math.Abs(decoySurplus) > 1 & !stuck & innerIterations < 3) + { + innerIterations++; + swapped = 
false; + foreach (int donorIndexA in groupsOfIndices[groupA].Where(idx => !swappedDonors.Contains(idx)).OrderByDescending(idx => idx)) + { + int donorIndexADecoyCount = donors[donorIndexA].DecoyAcceptors.Count(); + switch (decoySurplus > 0) + { + case true: // i.e., too many decoys + if (donorIndexADecoyCount < 1) continue; // No decoys to swap + foreach (int donorIndexB in groupsOfIndices[groupB].Where(idx => !swappedDonors.Contains(idx)).OrderByDescending(idx => idx)) + { + if (donors[donorIndexB].DecoyAcceptors.Count() < donorIndexADecoyCount) + { + GroupSwap(donors, groupsOfIndices, donorIndexA, donorIndexB, groupA, groupB, + scoreCutoff, swappedDonors, ref targetSurplus, ref decoySurplus); + swapped = true; + break; + } + } + break; + case false: // i.e., too few decoys + foreach (int donorIndexB in groupsOfIndices[groupB].Where(idx => !swappedDonors.Contains(idx)).OrderByDescending(idx => idx)) + { + if (donors[donorIndexB].DecoyAcceptors.Count() > donorIndexADecoyCount) + { + GroupSwap(donors, groupsOfIndices, donorIndexA, donorIndexB, groupA, groupB, + scoreCutoff, swappedDonors, ref targetSurplus, ref decoySurplus); + swapped = true; + break; + } + } + break; + } + + // If we reach the index of the list of donorGroups, set stuck to true so that the outer loop will break + if (donorIndexA == minIndex) + { + stuck = true; + break; + } + if (swapped) + break; + } + } + } + } + } + + /// + /// Takes in a list of donor groups and a list of indices for each group, and swaps two groups of indices + /// Updates the targetSurplus and decoySurplus variables + /// Updates the swappedDonors hash set to keep track of which donors have been swapped + /// This is done to equalize the number of targets and decoys in each paritition for cross validation + /// + public static void GroupSwap( + List donors, + List[] groupsOfIndices, + int donorIndexA, + int donorIndexB, + int groupsOfIndicesIndexA, + int groupsOfIndicesIndexB, + double scoreCutoff, + HashSet swappedDonors, + 
ref int targetSurplus, + ref int decoySurplus) + { + // Multiply by two because the surplus is the difference between the two groups + // So removing one peak from one group and adding it to the other group is a difference of two + targetSurplus += 2 * ( + donors[donorIndexB].TargetAcceptors.Count(peak => peak.MbrScore >= scoreCutoff) - + donors[donorIndexA].TargetAcceptors.Count(peak => peak.MbrScore >= scoreCutoff)); + decoySurplus += 2 * ( + donors[donorIndexB].DecoyAcceptors.Count - + donors[donorIndexA].DecoyAcceptors.Count); + + groupsOfIndices[groupsOfIndicesIndexA].Add(donorIndexB); + groupsOfIndices[groupsOfIndicesIndexA].Remove(donorIndexA); + groupsOfIndices[groupsOfIndicesIndexB].Add(donorIndexA); + groupsOfIndices[groupsOfIndicesIndexB].Remove(donorIndexB); + } + + /// + /// Creates chromatographic peak data that will be used to train the machine learning model + /// Classifies peaks as positive or negative training examples + /// Positive training examples are peaks with MBR scores above the 25th percentile, + /// Negative training examples are peaks with random retention times + /// + /// The list of donor groups. + /// The list of donor indices. + /// The maximum number of threads. + /// The enumerable of chromatographic peak data. 
+ public IEnumerable CreateChromatographicPeakData(List donors, List donorIndices, int maxThreads) + { + object ChromatographicPeakDataListLock = new object(); + List ChromatographicPeakDataList = new List(); + int[] threads = Enumerable.Range(0, maxThreads).ToArray(); + + List pipScores = new(); + foreach(int i in donorIndices) + { + pipScores.AddRange(donors[i].Select(peak => peak.MbrScore)); + } + pipScores.Sort((a, b) => b.CompareTo(a)); // This is a descending sort + double groupSpecificPipScoreCutoff = pipScores[(int)Math.Floor(pipScores.Count * 0.25)]; + + Parallel.ForEach(Partitioner.Create(0, donorIndices.Count), + new ParallelOptions { MaxDegreeOfParallelism = maxThreads }, + (range, loopState) => + { + List localChromatographicPeakDataList = new List(); + for (int i = range.Item1; i < range.Item2; i++) + { + var donor = donors[donorIndices[i]]; + foreach (var peak in donor) + { + ChromatographicPeakData newChromatographicPeakData = new ChromatographicPeakData(); + if (peak.RandomRt) + { + newChromatographicPeakData = CreateOneChromatographicPeakDataEntry(peak, label: false); + localChromatographicPeakDataList.Add(newChromatographicPeakData); + } + else if (!peak.RandomRt & peak.MbrScore >= groupSpecificPipScoreCutoff) + { + newChromatographicPeakData = CreateOneChromatographicPeakDataEntry(peak, label: true); + localChromatographicPeakDataList.Add(newChromatographicPeakData); + } + } + } + lock (ChromatographicPeakDataListLock) + { + ChromatographicPeakDataList.AddRange(localChromatographicPeakDataList); + } + }); + + ChromatographicPeakData[] pda = ChromatographicPeakDataList.ToArray(); + + return pda.AsEnumerable(); + } + + /// + /// Creates chromatographic peak data, but uses PEP values instead of MBR scores to select the positive training examples + /// + /// The list of donor groups. + /// The list of donor indices. + /// The maximum number of threads. + /// The enumerable of chromatographic peak data. 
+ public IEnumerable CreateChromatographicPeakDataIteration(List donors, List donorIndices, int maxThreads) + { + object ChromatographicPeakDataListLock = new object(); + List ChromatographicPeakDataList = new List(); + int[] threads = Enumerable.Range(0, maxThreads).ToArray(); + + List peps = new(); + foreach (int i in donorIndices) + { + peps.AddRange(donors[i].Select(peak => peak.MbrPep ?? 1)); + } + peps.Sort(); + double groupSpecificPepCutoff = peps[(int)Math.Floor(peps.Count * 0.25)]; + + Parallel.ForEach(Partitioner.Create(0, donorIndices.Count), + new ParallelOptions { MaxDegreeOfParallelism = maxThreads }, + (range, loopState) => + { + List localChromatographicPeakDataList = new List(); + for (int i = range.Item1; i < range.Item2; i++) + { + var donor = donors[donorIndices[i]]; + foreach (var peak in donor) + { + ChromatographicPeakData newChromatographicPeakData = new ChromatographicPeakData(); + if (peak.RandomRt) + { + newChromatographicPeakData = CreateOneChromatographicPeakDataEntry(peak, label: false); + localChromatographicPeakDataList.Add(newChromatographicPeakData); + } + else if (!peak.RandomRt & peak.MbrPep <= groupSpecificPepCutoff) + { + newChromatographicPeakData = CreateOneChromatographicPeakDataEntry(peak, label: true); + localChromatographicPeakDataList.Add(newChromatographicPeakData); + } + } + } + lock (ChromatographicPeakDataListLock) + { + ChromatographicPeakDataList.AddRange(localChromatographicPeakDataList); + } + }); + + ChromatographicPeakData[] pda = ChromatographicPeakDataList.ToArray(); + + return pda.AsEnumerable(); + } + + public static void Compute_PEP_For_All_Peaks( + List donors, + List donorIndices, + MLContext mLContext, + TransformerChain>> trainedModel, + string outputFolder, int maxThreads) + { + object lockObject = new object(); + + //the trained model is not threadsafe. Therefore, to use the same model for each thread saved the model to disk. Then each thread reads its own copy of the model back from disk. 
+ //If there is no output folder specified, then this can't happen. We set maxThreads equal to one and use the model that gets passed into the method. + if (String.IsNullOrEmpty(outputFolder)) + { + maxThreads = 1; + } + + Parallel.ForEach(Partitioner.Create(0, donorIndices.Count), + new ParallelOptions { MaxDegreeOfParallelism = maxThreads }, + (range, loopState) => + { + + ITransformer threadSpecificTrainedModel; + if (maxThreads == 1) + { + threadSpecificTrainedModel = trainedModel; + } + else + { + threadSpecificTrainedModel = mLContext.Model.Load(Path.Combine(outputFolder, "model.zip"), out DataViewSchema savedModelSchema); + } + + // one prediction engine per thread, because the prediction engine is not thread-safe + var threadPredictionEngine = mLContext.Model.CreatePredictionEngine(threadSpecificTrainedModel); + + for (int i = range.Item1; i < range.Item2; i++) + { + DonorGroup donor = donors[donorIndices[i]]; + + foreach(ChromatographicPeak peak in donor) + { + ChromatographicPeakData pd = CreateOneChromatographicPeakDataEntry(peak, label: !peak.RandomRt); + var pepValuePrediction = threadPredictionEngine.Predict(pd); + peak.MbrPep = 1 - pepValuePrediction.Probability; + } + } + }); + } + + public static string AggregateMetricsForOutput(List allMetrics) + { + List accuracy = allMetrics.Select(m => m.Accuracy).ToList(); + List areaUnderRocCurve = allMetrics.Select(m => m.AreaUnderRocCurve).ToList(); + List areaUnderPrecisionRecallCurve = allMetrics.Select(m => m.AreaUnderPrecisionRecallCurve).ToList(); + List F1Score = allMetrics.Select(m => m.F1Score).ToList(); + List logLoss = allMetrics.Select(m => m.LogLoss).ToList(); + List logLossReduction = allMetrics.Select(m => m.LogLossReduction).ToList(); + List positivePrecision = allMetrics.Select(m => m.PositivePrecision).ToList(); + List positiveRecall = allMetrics.Select(m => m.PositiveRecall).ToList(); + List negativePrecision = allMetrics.Select(m => m.NegativePrecision).ToList(); + List negativeRecall = 
allMetrics.Select(m => m.NegativeRecall).ToList(); + + // log-loss can stochastically take on a value of infinity. + // correspondingly, log-loss reduction can be negative infinity. + // when this happens for one or more of the metrics, it can lead to uninformative numbers. + // so, unless they are all infinite, we remove them from the average. If they are all infinite, we report that. + + logLoss.RemoveAll(x => x == Double.PositiveInfinity); + logLossReduction.RemoveAll(x => x == Double.NegativeInfinity); + + double logLossAverage = Double.PositiveInfinity; + double logLossReductionAverage = Double.NegativeInfinity; + + if ((logLoss != null) && (logLoss.Any())) + { + logLossAverage = logLoss.Average(); + } + + if ((logLossReduction != null) && (logLossReduction.Any())) + { + logLossReductionAverage = logLossReduction.Average(); + } + + StringBuilder s = new StringBuilder(); + s.AppendLine(); + s.AppendLine("************************************************************"); + s.AppendLine("* Metrics for Determination of PEP Using Binary Classification "); + s.AppendLine("*-----------------------------------------------------------"); + s.AppendLine("* Accuracy: " + accuracy.Average().ToString()); + s.AppendLine("* Area Under Curve: " + areaUnderRocCurve.Average().ToString()); + s.AppendLine("* Area under Precision recall Curve: " + areaUnderPrecisionRecallCurve.Average().ToString()); + s.AppendLine("* F1Score: " + F1Score.Average().ToString()); + s.AppendLine("* LogLoss: " + logLossAverage.ToString()); + s.AppendLine("* LogLossReduction: " + logLossReductionAverage.ToString()); + s.AppendLine("* PositivePrecision: " + positivePrecision.Average().ToString()); + s.AppendLine("* PositiveRecall: " + positiveRecall.Average().ToString()); + s.AppendLine("* NegativePrecision: " + negativePrecision.Average().ToString()); + s.AppendLine("* NegativeRecall: " + negativeRecall.Average().ToString()); + s.AppendLine("************************************************************"); + 
return s.ToString(); + } + + public static ChromatographicPeakData CreateOneChromatographicPeakDataEntry(ChromatographicPeak peak,bool label) + { + + peak.PepPeakData = new ChromatographicPeakData + { + PpmErrorScore = (float)peak.PpmScore, + IntensityScore = (float)peak.IntensityScore, + RtScore = (float)peak.RtScore, + ScanCountScore = (float)peak.ScanCountScore, + IsotopicDistributionScore = (float)peak.IsotopicDistributionScore, + + PpmErrorRaw = (float)Math.Abs(peak.MassError), + IntensityRaw = (float)Math.Log2(peak.Intensity), + RtPredictionErrorRaw = (float)Math.Abs(peak.RtPredictionError), + ScanCountRaw = (float)peak.IsotopicEnvelopes.Count, + IsotopicPearsonCorrelation = (float)(peak.IsotopicPearsonCorrelation), + + Label = label, + + }; + + return peak.PepPeakData; + } + } +} \ No newline at end of file diff --git a/mzLib/FlashLFQ/PEP/TruePositivePrediction.cs b/mzLib/FlashLFQ/PEP/TruePositivePrediction.cs new file mode 100644 index 000000000..5837959d8 --- /dev/null +++ b/mzLib/FlashLFQ/PEP/TruePositivePrediction.cs @@ -0,0 +1,18 @@ +using Microsoft.ML.Data; + +namespace FlashLFQ.PEP +{ + public class TruePositivePrediction + { + // ColumnName attribute is used to change the column name from + // its default value, which is the name of the field. + [ColumnName("PredictedLabel")] + public bool Prediction; + + // No need to specify ColumnName attribute, because the field + // name "Probability" is the column name we want. 
+ public float Probability; + + public float Score; + } +} \ No newline at end of file diff --git a/mzLib/FlashLFQ/RtInfo.cs b/mzLib/FlashLFQ/RtInfo.cs index 0750e4588..ceda038e4 100644 --- a/mzLib/FlashLFQ/RtInfo.cs +++ b/mzLib/FlashLFQ/RtInfo.cs @@ -9,15 +9,10 @@ namespace FlashLFQ public class RtInfo { public double PredictedRt { get; } - public double Width { get; } + public double Width { get; set; } public double RtStartHypothesis => PredictedRt - (Width / 2.0); public double RtEndHypothesis => PredictedRt + (Width / 2.0); - // These will be introduced in a later PR. For now, we're sticking with the classic version - //private double _minimumWindowWidth = 0.5; - //public double RtStartHypothesis => PredictedRt - Math.Max((Width / 2.0), _minimumWindowWidth/2); // the Math.Max components ensure that the width of an RT Window is at least _minimumWindowWidth wide - //public double RtEndHypothesis => PredictedRt + Math.Max((Width / 2.0), _minimumWindowWidth/2); - public RtInfo(double predictedRt, double width) { PredictedRt = predictedRt; diff --git a/mzLib/FlashLFQ/SpectraFileInfo.cs b/mzLib/FlashLFQ/SpectraFileInfo.cs index 5dd5b4ab2..9cc24b6d7 100644 --- a/mzLib/FlashLFQ/SpectraFileInfo.cs +++ b/mzLib/FlashLFQ/SpectraFileInfo.cs @@ -1,4 +1,6 @@ -namespace FlashLFQ +using System.IO; + +namespace FlashLFQ { public class SpectraFileInfo { @@ -39,5 +41,9 @@ public override int GetHashCode() { return FullFilePathWithExtension.GetHashCode(); } + public override string ToString() + { + return Path.GetFileName(FullFilePathWithExtension); + } } } \ No newline at end of file diff --git a/mzLib/Proteomics/PSM/PsmFromTsv.cs b/mzLib/Proteomics/PSM/PsmFromTsv.cs index 95605ab49..5837c745d 100644 --- a/mzLib/Proteomics/PSM/PsmFromTsv.cs +++ b/mzLib/Proteomics/PSM/PsmFromTsv.cs @@ -259,6 +259,5 @@ public PsmFromTsv(PsmFromTsv psm, string fullSequence, int index = 0, string bas LocalizedGlycan = psm.LocalizedGlycan; } - } } diff --git 
a/mzLib/TestFlashLFQ/ChromatographicPeakTests.cs b/mzLib/TestFlashLFQ/ChromatographicPeakTests.cs new file mode 100644 index 000000000..cd6301a91 --- /dev/null +++ b/mzLib/TestFlashLFQ/ChromatographicPeakTests.cs @@ -0,0 +1,67 @@ +using FlashLFQ; +using NUnit.Framework; +using System; +using System.Collections.Generic; +using Assert = NUnit.Framework.Legacy.ClassicAssert; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace TestFlashLFQ +{ + public class ChromatographicPeakTests + { + private ChromatographicPeak CreateChromatographicPeak() + { + // Create a sample SpectraFileInfo + SpectraFileInfo spectraFileInfo = new SpectraFileInfo("sampleFile", "A", 1, 1, 1); + + // Create a sample Identification + Identification identification = new Identification(spectraFileInfo, "MPEPTIDE", "M[Oxidation]PEPTIDE", 100, 10, 2, new List()); + + // Create a ChromatographicPeak instance + ChromatographicPeak chromatographicPeak = new ChromatographicPeak(identification, false, spectraFileInfo); + + IndexedMassSpectralPeak peak1 = new IndexedMassSpectralPeak(100, 300, 1, 9.5); + IndexedMassSpectralPeak peak2 = new IndexedMassSpectralPeak(100, 300, 1, 10.5); + + // Add sample IsotopicEnvelopes + chromatographicPeak.IsotopicEnvelopes = new List() + { + new IsotopicEnvelope(peak1, 2, 300, 1), + new IsotopicEnvelope(peak2, 2, 300, 1) + }; + + return chromatographicPeak; + } + + + [Test] + public void TestResolveIdentifications() + { + // Arrange + ChromatographicPeak chromatographicPeak = CreateChromatographicPeak(); + + // Act + chromatographicPeak.ResolveIdentifications(); + + // Assert + Assert.AreEqual(1, chromatographicPeak.NumIdentificationsByBaseSeq); + Assert.AreEqual(1, chromatographicPeak.NumIdentificationsByFullSeq); + } + + [Test] + public void TestToString() + { + // Arrange + ChromatographicPeak chromatographicPeak = CreateChromatographicPeak(); + + // Act + string result = chromatographicPeak.ToString(); + + // Assert + string expected = 
"sampleFile\tMPEPTIDE\tM[Oxidation]PEPTIDE\t\t\t100\t10\t2\t51.007276466879\t0\t-\t-\t-\t-\t-\t0\tMSMS\t\t\t1\t1\t1\t0\tNaN\tFalse\tFalse"; + Assert.AreEqual(expected, result); + } + } +} diff --git a/mzLib/TestFlashLFQ/TestFlashLFQ.cs b/mzLib/TestFlashLFQ/TestFlashLFQ.cs index 6b8be0325..ed72ef077 100644 --- a/mzLib/TestFlashLFQ/TestFlashLFQ.cs +++ b/mzLib/TestFlashLFQ/TestFlashLFQ.cs @@ -427,12 +427,18 @@ public static void TestFlashLfqNormalization() var id3 = new Identification(mzml, "EGFQVADGPLYR", "EGFQVADGPLYR", 1350.65681, 94.12193, 2, new List { pg }); var id4 = new Identification(mzml2, "EGFQVADGPLYR", "EGFQVADGPLYR", 1350.65681, 94.12193, 2, new List { pg }); - results = new FlashLfqEngine(new List { id1, id2, id3, id4 }, normalize: true).Run(); + results = new FlashLfqEngine(new List { id1, id2, id3, id4 }, normalize: true, integrate: false).Run(); int int7 = (int)System.Math.Round(results.PeptideModifiedSequences["EGFQVADGPLYR"].GetIntensity(raw) + results.PeptideModifiedSequences["EGFQVADGPLYR"].GetIntensity(raw2)); int int8 = (int)System.Math.Round(results.PeptideModifiedSequences["EGFQVADGPLYR"].GetIntensity(mzml) + results.PeptideModifiedSequences["EGFQVADGPLYR"].GetIntensity(mzml2)); Assert.That(int7 > 0); Assert.That(int7 == int8); + + results.ReNormalizeResults(true); + int int9 = (int)System.Math.Round(results.PeptideModifiedSequences["EGFQVADGPLYR"].GetIntensity(raw) + results.PeptideModifiedSequences["EGFQVADGPLYR"].GetIntensity(raw2)); + int int10 = (int)System.Math.Round(results.PeptideModifiedSequences["EGFQVADGPLYR"].GetIntensity(mzml) + results.PeptideModifiedSequences["EGFQVADGPLYR"].GetIntensity(mzml2)); + Assert.That(int9 > int7); + Assert.That(int9, Is.EqualTo(int10).Within(1)); } [Test] @@ -487,15 +493,15 @@ public static void TestFlashLfqMergeResults() public static void TestFlashLfqMatchBetweenRuns() { List filesToWrite = new List { "mzml_1", "mzml_2" }; - List pepSequences = new List - { - "PEPTIDE", - "PEPTIDEV", - "PEPTIDEVV", 
+ List pepSequences = new List + { + "PEPTIDE", + "PEPTIDEV", + "PEPTIDEVV", "TARGETPEP", "PEPTIDEVVV", - "PEPTIDEVVVV", - "PEPTIDEVVVVA", + "PEPTIDEVVVV", + "PEPTIDEVVVVA", "PEPTIDEVVVVAA" }; double intensity = 1e6; @@ -593,9 +599,6 @@ public static void TestFlashLfqMatchBetweenRuns() FlashLfqEngine engine = new FlashLfqEngine(new List { id1, id2, id3, id4, id5, id6, id7, id9, id10 }, matchBetweenRuns: true); FlashLfqEngine interquartileEngine = new FlashLfqEngine( new List { id1, id2, id3, id4, id5, id11, id12, id6, id7, id9, id10, id13, id14 }, matchBetweenRuns: true); - FlashLfqEngine engineAmbiguous = new FlashLfqEngine(new List { id1, id2, id3, id4, id5, id6, id7, id9, id10, id18, id15, id16, id17 }, matchBetweenRuns: true, - peptideSequencesToUse: pepSequences); - //run the engine var results = engine.Run(); @@ -628,14 +631,14 @@ public static void TestFlashLfqMatchBetweenRuns() rtDiffs.Add(Math.Abs(file1Rt[i] - file2Rt[i])); } + FlashLfqEngine engineAmbiguous = new FlashLfqEngine(new List { id1, id2, id3, id4, id5, id6, id7, id9, id10, id18, id15, id16, id17 }, matchBetweenRuns: true, peptideSequencesToQuantify: pepSequences, donorCriterion: DonorCriterion.Intensity); // The ambiguous engine tests that a non-confident ID (i.e., a PSM that didn't make the peptide level fdr cutoff) - // gets overwritten by a MBR transfer of a confident ID, and that non-confident IDs are overwriteen by confident MS2 ids + // gets overwritten by a MBR transfer of a confident ID, and that non-confident IDs are overwritten by confident MS2 ids results = engineAmbiguous.Run(); Assert.False(results.PeptideModifiedSequences.Select(kvp => kvp.Key).Contains("DECOYPEP")); Assert.False(results.Peaks[file1].Any(peak => peak.Identifications.Any(id => id.ModifiedSequence.Contains("DECOYPEP")))); Assert.That(results.Peaks[file2].Any(peak => peak.Identifications.First().ModifiedSequence == "TARGETPEP")); Assert.AreEqual(results.Peaks[file2].Count(peak => peak.IsMbrPeak), 2); - } [Test] @@ 
-1036,7 +1039,7 @@ public static void TestMatchBetweenRunsWithNoIdsInCommon() FlashLfqEngine engine = new FlashLfqEngine(new List { id1, id2, id3, id4, id5, id6, id7, id9, id10 }, matchBetweenRuns: true); var results = engine.Run(); - // no assertions - just don't crash + Assert.Pass();// no assertions - just don't crash } [Test] @@ -1207,7 +1210,11 @@ public static void TestFlashLfqQoutputRealData() } } - var engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: false, useSharedPeptidesForProteinQuant: true, maxThreads: -1); + var engine = new FlashLfqEngine(ids, + matchBetweenRuns: true, + requireMsmsIdInCondition: false, + useSharedPeptidesForProteinQuant: true, + maxThreads: -1); var results = engine.Run(); results.WriteResults(Path.Combine(outputDirectory,"peaks.tsv"), Path.Combine(outputDirectory, "peptides.tsv"), Path.Combine(outputDirectory, "proteins.tsv"), Path.Combine(outputDirectory, "bayesian.tsv"),true); @@ -1219,8 +1226,8 @@ public static void TestFlashLfqQoutputRealData() Assert.AreEqual(4, peaks[0].Count(m => m.IsMbrPeak == false)); Assert.AreEqual(5, peaks[1].Count(m => m.IsMbrPeak == false)); - CollectionAssert.AreEquivalent(new string[] { "Q7KZF4", "Q7KZF4", "P52298", "Q15149" }, peaks[0].SelectMany(i => i.Identifications).Select(g => g.ProteinGroups.First()).Select(m => m.ProteinGroupName).ToArray()); - CollectionAssert.AreEquivalent(new string[] { "Q7KZF4", "P52298", "Q15149", "Q7KZF4", "Q7KZF4", "P52298" }, peaks[1].SelectMany(i => i.Identifications).Select(g => g.ProteinGroups.First()).Select(m => m.ProteinGroupName).ToArray()); + CollectionAssert.AreEquivalent(new string[] { "Q7KZF4", "Q7KZF4", "P52298", "Q15149", "Q15149" }, peaks[0].SelectMany(i => i.Identifications).Select(g => g.ProteinGroups.First()).Select(m => m.ProteinGroupName).ToArray()); + CollectionAssert.AreEquivalent(new string[] { "Q7KZF4", "P52298", "Q15149", "Q15149", "Q7KZF4", "Q7KZF4", "P52298" }, peaks[1].SelectMany(i => 
i.Identifications).Select(g => g.ProteinGroups.First()).Select(m => m.ProteinGroupName).ToArray()); Assert.AreEqual(6, peptides.Count); CollectionAssert.AreEquivalent(new string[] { "Q7KZF4", "P52298", "Q15149", "Q15149", "Q7KZF4", "P52298" }, peptides.Select(g => g.ProteinGroups.First()).Select(m => m.ProteinGroupName).ToArray()); @@ -1344,6 +1351,7 @@ public static void RealDataMbrTest() double rt = double.Parse(split[2]); int z = (int)double.Parse(split[6]); var proteins = split[24].Split(new char[] { '|' }); + bool decoyPeptide = split[39].Equals("D"); List proteinGroups = new List(); foreach (var protein in proteins) { @@ -1358,66 +1366,62 @@ public static void RealDataMbrTest() } } - Identification id = new Identification(file, baseSequence, fullSequence, monoMass, rt, z, proteinGroups); + Identification id = new Identification(file, baseSequence, fullSequence, monoMass, rt, z, proteinGroups, decoy: decoyPeptide); ids.Add(id); } - var engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: false, maxThreads: 5); + var engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: false, maxThreads: 1, matchBetweenRunsFdrThreshold: 0.15, maxMbrWindow: 1); var results = engine.Run(); + // Count the number of MBR results in each file var f1r1MbrResults = results .PeptideModifiedSequences - .Where(p => p.Value.GetDetectionType(f1r1) == DetectionType.MBR && p.Value.GetDetectionType(f1r2) == DetectionType.MSMS).ToList(); - - Assert.That(f1r1MbrResults.Count >= 132); - - var f1r2MbrResults = results.PeptideModifiedSequences - .Where(p => p.Value.GetDetectionType(f1r1) == DetectionType.MSMS && p.Value.GetDetectionType(f1r2) == DetectionType.MBR).ToList(); - - Assert.GreaterOrEqual(f1r2MbrResults.Count, 77); - - List<(double, double)> peptideIntensities = new List<(double, double)>(); + .Where(p => p.Value.GetDetectionType(f1r1) == DetectionType.MBR && p.Value.GetDetectionType(f1r2) == DetectionType.MSMS) + .ToList(); + 
var f1r2MbrResults = results + .PeptideModifiedSequences + .Where(p => p.Value.GetDetectionType(f1r1) == DetectionType.MSMS && p.Value.GetDetectionType(f1r2) == DetectionType.MBR) + .ToList(); - foreach (var peptide in f1r1MbrResults) - { - double mbrIntensity = Math.Log(peptide.Value.GetIntensity(f1r1)); - double msmsIntensity = Math.Log(peptide.Value.GetIntensity(f1r2)); - peptideIntensities.Add((mbrIntensity, msmsIntensity)); - } + // Due to the small number of results in the test data, the counts and correlation values can be quite variable. + // Any change to ML.NET or the PEP Analysis engine will cause these to change. + Console.WriteLine("r1 PIP event count: " + f1r1MbrResults.Count); + Console.WriteLine("r2 PIP event count: " + f1r2MbrResults.Count); + Assert.AreEqual(138, f1r1MbrResults.Count); + Assert.AreEqual(70, f1r2MbrResults.Count); - double corr = Correlation.Pearson(peptideIntensities.Select(p => p.Item1), peptideIntensities.Select(p => p.Item2)); - Assert.Greater(corr, 0.8); + // Check that MS/MS identified peaks and MBR identified peaks have similar intensities + List<(double, double)> peptideIntensities = f1r1MbrResults.Select(pep => (Math.Log(pep.Value.GetIntensity(f1r1)), Math.Log(pep.Value.GetIntensity(f1r2)))).ToList(); + double corrRun1 = Correlation.Pearson(peptideIntensities.Select(p => p.Item1), peptideIntensities.Select(p => p.Item2)); - peptideIntensities.Clear(); - foreach (var peptide in f1r2MbrResults) - { - double mbrIntensity = Math.Log(peptide.Value.GetIntensity(f1r2)); - double msmsIntensity = Math.Log(peptide.Value.GetIntensity(f1r1)); - peptideIntensities.Add((mbrIntensity, msmsIntensity)); - } - - corr = Correlation.Pearson(peptideIntensities.Select(p => p.Item1), peptideIntensities.Select(p => p.Item2)); + peptideIntensities = f1r2MbrResults.Select(pep => (Math.Log(pep.Value.GetIntensity(f1r1)), Math.Log(pep.Value.GetIntensity(f1r2)))).ToList(); + double corrRun2 = Correlation.Pearson(peptideIntensities.Select(p => p.Item1), 
peptideIntensities.Select(p => p.Item2)); - // Update means more MBR-detections, which decreases the correlation slightly. Will increase again when we begin filtering based on MBR score - Assert.Greater(corr, 0.69); + // These values are also sensitive, changes can cause them to dip as low as 0.6 (specifically the corrRun2 value) + Console.WriteLine("r1 correlation: " + corrRun1); + Console.WriteLine("r2 correlation: " + corrRun2); + Assert.Greater(corrRun1, 0.75); + Assert.Greater(corrRun2, 0.65); // the "requireMsmsIdInCondition" field requires that at least one MS/MS identification from a protein // has to be observed in a condition for match-between-runs f1r1.Condition = "b"; engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: true, maxThreads: 5); results = engine.Run(); - var proteinsObservedInF1 = ids.Where(p => p.FileInfo == f1r1).SelectMany(p => p.ProteinGroups).Distinct().ToList(); - var proteinsObservedInF2 = ids.Where(p => p.FileInfo == f1r2).SelectMany(p => p.ProteinGroups).Distinct().ToList(); + var proteinsObservedInF1 = ids.Where(id => !id.IsDecoy).Where(p => p.FileInfo == f1r1).SelectMany(p => p.ProteinGroups).Distinct().ToList(); + var proteinsObservedInF2 = ids.Where(id => !id.IsDecoy).Where(p => p.FileInfo == f1r2).SelectMany(p => p.ProteinGroups).Distinct().ToList(); var proteinsObservedInF1ButNotF2 = proteinsObservedInF1.Except(proteinsObservedInF2).ToList(); foreach (ProteinGroup protein in proteinsObservedInF1ButNotF2) { Assert.That(results.ProteinGroups[protein.ProteinGroupName].GetIntensity(f1r2) == 0); } - List peptidesToUse = ids.Select(id => id.ModifiedSequence).Take(400).Distinct().ToList(); - engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: true, maxThreads: 1, peptideSequencesToUse: peptidesToUse); + // Test that no decoys are reported in the final results + Assert.AreEqual(0, ids.Where(id => id.IsDecoy).Count(id => 
results.ProteinGroups.ContainsKey(id.ProteinGroups.First().ProteinGroupName))); + + List peptidesToUse = ids.Where(id => id.QValue <= 0.007 & !id.IsDecoy).Select(id => id.ModifiedSequence).Distinct().ToList(); + engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: true, maxThreads: 1, matchBetweenRunsFdrThreshold: 0.5, maxMbrWindow: 1, peptideSequencesToQuantify: peptidesToUse); results = engine.Run(); - var test = results.PeptideModifiedSequences.Select(kvp => !peptidesToUse.Contains(kvp.Key)).ToList(); CollectionAssert.AreEquivalent(results.PeptideModifiedSequences.Select(kvp => kvp.Key), peptidesToUse); } @@ -1652,14 +1656,13 @@ public static void TestAmbiguousFraction() peak1.ResolveIdentifications(); peak2.ResolveIdentifications(); - peak1.IsotopicEnvelopes.Add(new FlashLFQ.IsotopicEnvelope(new IndexedMassSpectralPeak(0, 0, 0, 0), 1, 1000)); - peak2.IsotopicEnvelopes.Add(new FlashLFQ.IsotopicEnvelope(new IndexedMassSpectralPeak(0, 0, 0, 0), 1, 10000)); + peak1.IsotopicEnvelopes.Add(new FlashLFQ.IsotopicEnvelope(new IndexedMassSpectralPeak(0, 0, 0, 0), 1, 1000, 1)); + peak2.IsotopicEnvelopes.Add(new FlashLFQ.IsotopicEnvelope(new IndexedMassSpectralPeak(0, 0, 0, 0), 1, 10000, 1)); peak1.CalculateIntensityForThisFeature(false); peak2.CalculateIntensityForThisFeature(false); - FlashLfqResults res = new FlashLfqResults(new List { fraction1, fraction2 }, new List { id1, id2, id3 }, - new HashSet { "peptide1", "peptide2"}); + FlashLfqResults res = new FlashLfqResults(new List { fraction1, fraction2 }, new List { id1, id2, id3 }); res.Peaks[fraction1].Add(peak1); res.Peaks[fraction2].Add(peak2); res.CalculatePeptideResults(quantifyAmbiguousPeptides: false); diff --git a/mzLib/TestFlashLFQ/TestPipEcho.cs b/mzLib/TestFlashLFQ/TestPipEcho.cs new file mode 100644 index 000000000..0d2388142 --- /dev/null +++ b/mzLib/TestFlashLFQ/TestPipEcho.cs @@ -0,0 +1,313 @@ +using NUnit.Framework; +using Readers; +using System.Collections.Generic; +using 
System.Linq; +using FlashLFQ; +using Assert = NUnit.Framework.Legacy.ClassicAssert; +using System.IO; +using FlashLFQ.PEP; +using System; +using Chemistry; +using MassSpectrometry; +using MzLibUtil; +using Test.FileReadingTests; +using UsefulProteomicsDatabases; + + +namespace TestFlashLFQ +{ + [TestFixture] + [System.Diagnostics.CodeAnalysis.ExcludeFromCodeCoverage] + public class TestPipEcho + { + [Test] + [TestCase(3)] + [TestCase(5)] + public static void TestDonorGroupEqualizer(int numGroups) + { + SpectraFileInfo fakeFile = new SpectraFileInfo("fakeFile", "A", 1, 1, 1); + Identification id = new Identification(fakeFile, "KPVGAAK", "KPVGAAK", 669.4173, 1.9398, 2, new List { new ProteinGroup("P16403", "H12", "HUMAN") }); + + ChromatographicPeak targetPeak = new ChromatographicPeak(id, isMbrPeak: true, fakeFile, randomRt: false); + ChromatographicPeak decoyPeak = new ChromatographicPeak(id, isMbrPeak: true, fakeFile, randomRt: true); + targetPeak.MbrScore = 100; + + Random random = new Random(42); + List donorGroups = new List(); + for (int i = 0; i < 10000; i++) + { + int numberTargets = random.Next(0, 10); + int numberDecoys = random.Next(0, 10); + donorGroups.Add(new DonorGroup(id, Enumerable.Repeat(targetPeak, numberTargets).ToList(), Enumerable.Repeat(decoyPeak, numberDecoys).ToList())); + } + + donorGroups = PepAnalysisEngine.OrderDonorGroups(donorGroups); + var donorIndices = PepAnalysisEngine.GetDonorGroupIndices(donorGroups, numGroups: numGroups, scoreCutoff: 50); + + Assert.That(donorIndices.Count, Is.EqualTo(numGroups)); + List targetPeakCounts = new(); + List decoyPeakCounts = new(); + for (int i = 0; i < numGroups; i++) + { + int targetSum = 0; + int decoySum = 0; + foreach (int idx in donorIndices[i]) + { + targetSum += donorGroups[idx].TargetAcceptors.Count; + decoySum += donorGroups[idx].DecoyAcceptors.Count; + } + targetPeakCounts.Add(targetSum); + decoyPeakCounts.Add(decoySum); + } + + // Assert that each group has an approximately equal number 
of target peaks + Assert.That(targetPeakCounts.Max() - targetPeakCounts.Min(), Is.LessThanOrEqualTo(numGroups-1)); + // Assert that each group has an approximately equal number of decoy peaks + Assert.That(decoyPeakCounts.Max() - decoyPeakCounts.Min(), Is.LessThanOrEqualTo(numGroups - 1)); + } + + [Test] + public static void TestMbrScorer() + { + SpectraFileInfo fakeFile = new SpectraFileInfo("fakeFile", "A", 1, 1, 1); + SpectraFileInfo fakeDonorFile = new SpectraFileInfo("fakeFile", "A", 1, 1, 1); + + double idMass = 669.4173; + Identification id = new Identification(fakeFile, "KPVGAAK", "KPVGAAK", 669.4173, 1.9398, 2, new List { new ProteinGroup("P16403", "H12", "HUMAN") }); + Identification id2 = new Identification(fakeFile, "KPVGK", "KPVGK", 669.4173, 1.9398, 2, new List { new ProteinGroup("P16403", "H12", "HUMAN") }); + Identification donorId = new Identification(fakeFile, "KPVGK", "KPVGK", 669.4173, 1.9398, 2, new List { new ProteinGroup("P16403", "H12", "HUMAN") }); + id.PeakfindingMass = idMass; + id2.PeakfindingMass = idMass; + donorId.PeakfindingMass = idMass; + + var peak1 = new ChromatographicPeak(id, isMbrPeak: false, fakeFile, randomRt: false); + var peak2 = new ChromatographicPeak(id, isMbrPeak: false, fakeFile, randomRt: false); + var peak3 = new ChromatographicPeak(id2, isMbrPeak: false, fakeFile, randomRt: false); + var peak4 = new ChromatographicPeak(id, isMbrPeak: false, fakeFile, randomRt: false); + var donorPeak = new ChromatographicPeak(donorId, isMbrPeak: false, fakeDonorFile, randomRt: false); + var acceptorPeak = new ChromatographicPeak(donorId, isMbrPeak: true, fakeFile, randomRt: false); + + IndexedMassSpectralPeak imsPeak = new IndexedMassSpectralPeak((idMass + 0.001).ToMz(1), 1.1, 1, 25); + IndexedMassSpectralPeak imsPeak2 = new IndexedMassSpectralPeak((idMass - 0.001).ToMz(1), 1, 2, 26); + var iso1 = new FlashLFQ.IsotopicEnvelope(imsPeak, 1, 1, 0.98); + var iso2 = new FlashLFQ.IsotopicEnvelope(imsPeak2, 1, 1, 0.9); + + 
peak1.IsotopicEnvelopes.Add(iso1); + peak1.IsotopicEnvelopes.Add(iso2); + peak1.CalculateIntensityForThisFeature(false); + + peak4.IsotopicEnvelopes.Add(iso2); + peak4.CalculateIntensityForThisFeature(false); + + donorPeak.IsotopicEnvelopes.Add(iso2); + donorPeak.CalculateIntensityForThisFeature(false); + + acceptorPeak.IsotopicEnvelopes.Add(iso1); + acceptorPeak.CalculateIntensityForThisFeature(false); + + + var peakList = new List { peak1, peak4 }; + var peakDict = peakList.ToDictionary(keySelector: p => p.Apex.IndexedPeak, elementSelector: p => p); + + // Builds a scorer. Ppm Error and Intensity distributions both have mean and std-dev of 1 + MbrScorer scorer = new MbrScorer(peakDict, peakList, new MathNet.Numerics.Distributions.Normal(1, 1), new MathNet.Numerics.Distributions.Normal(1,1)); + + scorer.AddRtPredErrorDistribution(fakeDonorFile, new List { 0.5, 0.6, 0.5, 0.6, 0.5, 0.6, 0.5 }, 2); + + acceptorPeak.MbrScore = scorer.ScoreMbr(acceptorPeak, donorPeak, predictedRt: 25.1); + + Assert.That(acceptorPeak.MbrScore, Is.EqualTo(58.7).Within(0.1)); + Assert.That(acceptorPeak.PpmScore, Is.EqualTo(0.62).Within(0.01)); + Assert.That(acceptorPeak.IntensityScore, Is.EqualTo(0.32).Within(0.01)); + Assert.That(acceptorPeak.RtScore, Is.EqualTo(0.96).Within(0.01)); + Assert.That(acceptorPeak.ScanCountScore, Is.EqualTo(0.5).Within(0.01)); + Assert.That(acceptorPeak.IsotopicDistributionScore, Is.EqualTo(0.74).Within(0.01)); + } + + [Test] + public static void TestSpectraFileInfoString() + { + SpectraFileInfo fakeFile = new SpectraFileInfo(@"C:\Users\xyz\data\fakeFile.raw", "A", 1, 1, 1); + Assert.AreEqual("fakeFile.raw", fakeFile.ToString()); + } + + [Test] + public static void TestChromatographicPeakEquals() + { + SpectraFileInfo fakeFile = new SpectraFileInfo("fakeFile", "A", 1, 1, 1); + Identification id = new Identification(fakeFile, "KPVGAAK", "KPVGAAK", 669.4173, 1.9398, 2, new List { new ProteinGroup("P16403", "H12", "HUMAN") }); + Identification id2 = new 
Identification(fakeFile, "KPVGK", "KPVGK", 669.4173, 1.9398, 2, new List { new ProteinGroup("P16403", "H12", "HUMAN") }); + + var peak1 = new ChromatographicPeak(id, isMbrPeak: true, fakeFile, randomRt: false); + var peak2 = new ChromatographicPeak(id, isMbrPeak: true, fakeFile, randomRt: false); + var peak3 = new ChromatographicPeak(id2, isMbrPeak: true, fakeFile, randomRt: false); + var peak4 = new ChromatographicPeak(id, isMbrPeak: true, fakeFile, randomRt: false); + + IndexedMassSpectralPeak imsPeak = new IndexedMassSpectralPeak(1, 1, 1, 25); + IndexedMassSpectralPeak imsPeak2 = new IndexedMassSpectralPeak(1, 1, 1, 50); + var iso1 = new FlashLFQ.IsotopicEnvelope(imsPeak, 1, 1, 1); + var iso2 = new FlashLFQ.IsotopicEnvelope(imsPeak2, 1, 1, 1); + + peak1.IsotopicEnvelopes.Add(iso1); + peak1.CalculateIntensityForThisFeature(false); + + peak2.IsotopicEnvelopes.Add(iso1); + peak2.CalculateIntensityForThisFeature(false); + + peak3.IsotopicEnvelopes.Add(iso1); + peak3.CalculateIntensityForThisFeature(false); + + peak4.IsotopicEnvelopes.Add(iso2); + peak4.CalculateIntensityForThisFeature(false); + + Assert.That(peak1.Equals(peak2)); + Assert.That(!peak1.Equals(peak3)); + Assert.That(!peak1.Equals(peak4)); + + } + + /// + /// This test MatchBetweenRuns by creating two fake mzML files and a list of fake IDs. 
+ /// There are multiple sets of IDs, where most are shared between the two runs but one+ is/are missing + /// MBR is tested by ensuring that IDs are transferred between runs + /// + [Test] + public static void TestFlashLfqMatchBetweenRunsNearestNeighborDonors() + { + List filesToWrite = new List { "mzml_1", "mzml_2", "mzml_3" }; + List pepSequences = new List + { + "PEPTIDE", + "PEPTIDEV", + "PEPTIDEVV", + "TARGETPEP", + "PEPTIDEVVV", + "PEPTIDEVVVV", + "PEPTIDEVVVVA", + "PEPTIDEVVVVAA" + }; + double intensity = 1e6; + + double[] file1Rt = new double[] { 1.01, 1.02, 1.03, 1.033, 1.035, 1.04, 1.045, 1.05 }; + double[] file2Rt = new double[] { 1.00, 1.025, 1.03, 1.031, 1.035, 1.04, 1.055, 1.07 }; + + Loaders.LoadElements(); + + // generate mzml files (5 peptides each) + for (int f = 0; f < filesToWrite.Count; f++) + { + // 1 MS1 scan per peptide + MsDataScan[] scans = new MsDataScan[8]; + + for (int p = 0; p < pepSequences.Count; p++) + { + ChemicalFormula cf = new Proteomics.AminoAcidPolymer.Peptide(pepSequences[p]).GetChemicalFormula(); + IsotopicDistribution dist = IsotopicDistribution.GetDistribution(cf, 0.125, 1e-8); + double[] mz = dist.Masses.Select(v => v.ToMz(1)).ToArray(); + double[] intensities = dist.Intensities.Select(v => v * intensity).ToArray(); + if(f == 2) + { + // Make file 3 the most intense + intensities = intensities.Select(v => v * 5).ToArray(); + } + double rt; + if (f == 1) + { + rt = file2Rt[p]; + } + else + { + rt = file1Rt[p]; + } + + // add the scan + scans[p] = new MsDataScan(massSpectrum: new MzSpectrum(mz, intensities, false), oneBasedScanNumber: p + 1, msnOrder: 1, isCentroid: true, + polarity: Polarity.Positive, retentionTime: rt, scanWindowRange: new MzRange(400, 1600), scanFilter: "f", + mzAnalyzer: MZAnalyzerType.Orbitrap, totalIonCurrent: intensities.Sum(), injectionTime: 1.0, noiseData: null, nativeId: "scan=" + (p + 1)); + } + + // write the .mzML + Readers.MzmlMethods.CreateAndWriteMyMzmlWithCalibratedSpectra(new 
FakeMsDataFile(scans), + Path.Combine(TestContext.CurrentContext.TestDirectory, filesToWrite[f] + ".mzML"), false); + } + + // set up spectra file info + SpectraFileInfo file1 = new SpectraFileInfo(Path.Combine(TestContext.CurrentContext.TestDirectory, filesToWrite[0] + ".mzML"), "a", 0, 0, 0); + SpectraFileInfo file2 = new SpectraFileInfo(Path.Combine(TestContext.CurrentContext.TestDirectory, filesToWrite[1] + ".mzML"), "a", 1, 0, 0); + SpectraFileInfo file3 = new SpectraFileInfo(Path.Combine(TestContext.CurrentContext.TestDirectory, filesToWrite[2] + ".mzML"), "a", 2, 0, 0); + + // create some PSMs + var pg = new ProteinGroup("MyProtein", "gene", "org"); + Identification id1 = new Identification(file1, "PEPTIDE", "PEPTIDE", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDE").MonoisotopicMass, file1Rt[0] + 0.001, 1, new List { pg }); + Identification id2 = new Identification(file1, "PEPTIDEV", "PEPTIDEV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEV").MonoisotopicMass, file1Rt[1] + 0.001, 1, new List { pg }); + Identification id3 = new Identification(file1, "PEPTIDEVV", "PEPTIDEVV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVV").MonoisotopicMass, file1Rt[2] + 0.001, 1, new List { pg }); + Identification id4 = new Identification(file1, "PEPTIDEVVV", "PEPTIDEVVV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVV").MonoisotopicMass, file1Rt[4] + 0.001, 1, new List { pg }); + Identification id5 = new Identification(file1, "PEPTIDEVVVV", "PEPTIDEVVVV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVV").MonoisotopicMass, file1Rt[5] + 0.001, 1, new List { pg }); + + Identification id6 = new Identification(file2, "PEPTIDE", "PEPTIDE", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDE").MonoisotopicMass, file2Rt[0] + 0.001, 1, new List { pg }); + Identification id7 = new Identification(file2, "PEPTIDEV", "PEPTIDEV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEV").MonoisotopicMass, file2Rt[1] + 0.001, 1, new List { pg }); + // missing ID 8 
- MBR feature - "PEPTIDEVV" + + Identification id9 = new Identification(file2, "PEPTIDEVVV", "PEPTIDEVVV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVV").MonoisotopicMass, file2Rt[4] + 0.001, 1, new List { pg }); + Identification id10 = new Identification(file2, "PEPTIDEVVVV", "PEPTIDEVVVV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVV").MonoisotopicMass, file2Rt[5] + 0.001, 1, new List { pg }); + + + Identification id11 = new Identification(file3, "PEPTIDEV", "PEPTIDEV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEV").MonoisotopicMass, file1Rt[1] + 0.001, 1, new List { pg }); // same as peak 2 + Identification id12 = new Identification(file3, "PEPTIDEVV", "PEPTIDEVV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVV").MonoisotopicMass, file1Rt[2] + 0.001, 1, new List { pg }); // same as peak 3, but higher intensity + Identification id13 = new Identification(file3, "PEPTIDEVVV", "PEPTIDEVVV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVV").MonoisotopicMass, file1Rt[4] + 0.001, 1, new List { pg }); // same as peak 4 + + + // create the FlashLFQ engine + FlashLfqEngine neighborsEngine = new FlashLfqEngine(new List { id1, id2, id3, id4, id5, id6, id7, id9, id10, id11, id12, id13 }, + matchBetweenRuns: true, donorCriterion: DonorCriterion.Neighbors); + + //run the engine + var results = neighborsEngine.Run(); + + Assert.That(results.Peaks[file2].Count == 5); + Assert.That(results.Peaks[file2].Where(p => p.IsMbrPeak).Count() == 1); + + var peak = results.Peaks[file2].Where(p => p.IsMbrPeak).First(); + var otherFilePeak = results.Peaks[file1].Where(p => p.Identifications.First().BaseSequence == + peak.Identifications.First().BaseSequence).First(); + + + Assert.That(peak.Intensity > 0); + Assert.That(peak.Intensity == otherFilePeak.Intensity); + Assert.That(peak.Identifications.First().FileInfo == file1); // assure that the ID came from file 1, ie, the donor with the most neighboring peaks + + // create the FlashLFQ engine + 
FlashLfqEngine intensityEngine = new FlashLfqEngine(new List { id1, id2, id3, id4, id5, id6, id7, id9, id10, id11, id12, id13 }, + matchBetweenRuns: true, donorCriterion: DonorCriterion.Intensity); + + //run the engine + results = intensityEngine.Run(); + + Assert.That(results.Peaks[file2].Count == 5); + Assert.That(results.Peaks[file2].Where(p => p.IsMbrPeak).Count() == 1); + + peak = results.Peaks[file2].Where(p => p.IsMbrPeak).First(); + otherFilePeak = results.Peaks[file3].Where(p => p.Identifications.First().BaseSequence == + peak.Identifications.First().BaseSequence).First(); + + + Assert.That(peak.Intensity > 0); + Assert.That(peak.Intensity, Is.EqualTo(otherFilePeak.Intensity/5).Within(1)); // file 3 is five times more intense than file 2 + Assert.That(peak.Identifications.First().FileInfo == file3); // assure that the ID came from file 3, ie, the most intense donor peaks + + } + + } +} diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index 626ecf12f..3aa393afe 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -16,6 +16,8 @@ + + @@ -26,6 +28,8 @@ + +