diff --git a/mzLib/MzLibUtil/MzLibException.cs b/mzLib/MzLibUtil/MzLibException.cs
index 61ecc8d6b..885081433 100644
--- a/mzLib/MzLibUtil/MzLibException.cs
+++ b/mzLib/MzLibUtil/MzLibException.cs
@@ -1,8 +1,9 @@
-using System;
+#nullable enable
+using System;
 
 namespace MzLibUtil
 {
     [Serializable]
-    public class MzLibException(string message, Exception innerException = null)
+    public class MzLibException(string message, Exception? innerException = null)
         : Exception(message, innerException);
 }
\ No newline at end of file
diff --git a/mzLib/Omics/IBioPolymerWithSetMods.cs b/mzLib/Omics/IBioPolymerWithSetMods.cs
index 0b9926a01..12989b1f3 100644
--- a/mzLib/Omics/IBioPolymerWithSetMods.cs
+++ b/mzLib/Omics/IBioPolymerWithSetMods.cs
@@ -82,5 +82,90 @@ public static string GetBaseSequenceFromFullSequence(string fullSequence)
             }
             return sb.ToString();
         }
+
+        /// <summary>
+        /// Parses the bracketed modification annotations of a full sequence and maps each one to its OneBased index
+        /// </summary>
+        /// <remarks>
+        /// Every top-level "[...]" section must read "type:id"; only the id (the text after the first ':') is looked up in
+        /// <paramref name="allModsKnown"/>. The index starts at 1 and grows by one for each character outside of brackets,
+        /// so an annotation in front of the first residue gets index 1. An annotation that closes on the last character of
+        /// the full sequence and whose LocationRestriction contains "C-terminal." gets the base sequence length + 2.
+        /// </remarks>
+        /// <param name="fullSequence">Full sequence</param>
+        /// <param name="allModsKnown">All known modifications, keyed by the id that follows the ':' in the full sequence</param>
+        /// <returns>The modifications found in the full sequence, keyed by their OneBased index</returns>
+        /// <exception cref="MzLibUtil.MzLibException">When a full sequence is not in the correct format or a mod is not found in the allModsKnown dictionary</exception>
+        /// <exception cref="System.ArgumentException">When two annotations resolve to the same index</exception>
+        public static Dictionary<int, Modification> GetModificationDictionaryFromFullSequence(string fullSequence,
+            Dictionary<string, Modification> allModsKnown)
+        {
+            var allModsOneIsNterminus = new Dictionary<int, Modification>();
+            var baseSequence = GetBaseSequenceFromFullSequence(fullSequence);
+            int currentModStart = 0;
+            int currentModificationLocation = 1;
+            bool currentlyReadingMod = false;
+            int bracketCount = 0;
+
+            for (int r = 0; r < fullSequence.Length; r++)
+            {
+                char c = fullSequence[r];
+                if (c == '[')
+                {
+                    currentlyReadingMod = true;
+                    if (bracketCount == 0)
+                    {
+                        currentModStart = r + 1;
+                    }
+                    bracketCount++;
+                }
+                else if (c == ']')
+                {
+                    bracketCount--;
+                    if (bracketCount == 0)
+                    {
+                        // the annotation reads "type:id" (type e.g. "Fixed", "Variable", "Uniprot"); reject it explicitly when
+                        // the ':' is missing, otherwise splitIndex is -1 and the whole annotation would be taken as the id
+                        string modString = fullSequence.Substring(currentModStart, r - currentModStart);
+                        int splitIndex = modString.IndexOf(':');
+                        if (splitIndex < 0)
+                        {
+                            throw new MzLibUtil.MzLibException(
+                                "Error while trying to parse string into peptide: modification \"" + modString +
+                                "\" has no ':' separating its type from its id");
+                        }
+                        string modId = modString.Substring(splitIndex + 1);
+
+                        if (!allModsKnown.TryGetValue(modId, out var mod))
+                        {
+                            throw new MzLibUtil.MzLibException(
+                                "Could not find modification while reading string: " + fullSequence);
+                        }
+                        if (mod.LocationRestriction.Contains("C-terminal.") && r == fullSequence.Length - 1)
+                        {
+                            currentModificationLocation = baseSequence.Length + 2;
+                        }
+                        allModsOneIsNterminus.Add(currentModificationLocation, mod);
+                        currentlyReadingMod = false;
+                    }
+                }
+                else if (!currentlyReadingMod)
+                {
+                    currentModificationLocation++;
+                }
+                //else do nothing
+            }
+
+            return allModsOneIsNterminus;
+        }
+
+        /// <summary>
+        /// Returns a list of modifications from a full sequence
+        /// </summary>
+        /// <param name="fullSequence">Full sequence</param>
+        /// <param name="allModsKnown">All known modifications</param>
+        /// <returns>The values of the dictionary that GetModificationDictionaryFromFullSequence builds for the same arguments</returns>
+        public static List<Modification> GetModificationsFromFullSequence(string fullSequence,
+            Dictionary<string, Modification> allModsKnown) => [.. GetModificationDictionaryFromFullSequence(fullSequence, allModsKnown).Values];
     }
 }
diff --git a/mzLib/Proteomics/AminoAcidPolymer/AminoAcidPolymer.cs b/mzLib/Proteomics/AminoAcidPolymer/AminoAcidPolymer.cs index 1d7f1b231..1abb40e99 100644 --- a/mzLib/Proteomics/AminoAcidPolymer/AminoAcidPolymer.cs +++ b/mzLib/Proteomics/AminoAcidPolymer/AminoAcidPolymer.cs @@ -1103,7 +1103,7 @@ private void ParseSequence(string sequence) { modification = new OldSchoolChemicalFormulaModification(ChemicalFormula.ParseFormula(modString)); } - catch (MzLibException) + catch (MzLibException e) { if (double.TryParse(modString, out double mass)) { @@ -1111,7 +1111,7 @@ private void ParseSequence(string sequence) } else { - throw new MzLibException("Unable to correctly parse the following modification: " + modString); + throw new MzLibException("Unable to correctly parse the following modification: " + modString, e); } } diff --git a/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs b/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs index 8eb6e6bdf..aafec0a5e 100644 --- a/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs +++ b/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs @@ -69,7 +69,7 @@ public PeptideWithSetModifications(string sequence, Dictionary public void SetNonSerializedPeptideInfo(Dictionary idToMod, Dictionary accessionToProtein, DigestionParams dp) { - GetModsAfterDeserialization(idToMod); + _allModsOneIsNterminus = IBioPolymerWithSetMods.GetModificationDictionaryFromFullSequence(FullSequence, idToMod); GetProteinAfterDeserialization(accessionToProtein); _digestionParams = dp; } @@ -919,66 +919,6 @@ public void SetNonSerializedPeptideInfo(Dictionary idToMod Dictionary accessionToProtein, IDigestionParams dp) => SetNonSerializedPeptideInfo(idToMod, accessionToProtein, (DigestionParams)dp); - private void GetModsAfterDeserialization(Dictionary idToMod) - { - _allModsOneIsNterminus = new 
Dictionary(); - int currentModStart = 0; - int currentModificationLocation = 1; - bool currentlyReadingMod = false; - int bracketCount = 0; - - for (int r = 0; r < FullSequence.Length; r++) - { - char c = FullSequence[r]; - if (c == '[') - { - currentlyReadingMod = true; - if (bracketCount == 0) - { - currentModStart = r + 1; - } - bracketCount++; - } - else if (c == ']') - { - string modId = null; - bracketCount--; - if (bracketCount == 0) - { - try - { - //remove the beginning section (e.g. "Fixed", "Variable", "Uniprot") - string modString = FullSequence.Substring(currentModStart, r - currentModStart); - int splitIndex = modString.IndexOf(':'); - string modType = modString.Substring(0, splitIndex); - modId = modString.Substring(splitIndex + 1, modString.Length - splitIndex - 1); - } - catch (Exception e) - { - throw new MzLibUtil.MzLibException( - "Error while trying to parse string into peptide: " + e.Message); - } - if (!idToMod.TryGetValue(modId, out Modification mod)) - { - throw new MzLibUtil.MzLibException( - "Could not find modification while reading string: " + FullSequence); - } - if (mod.LocationRestriction.Contains("C-terminal.") && r == FullSequence.Length - 1) - { - currentModificationLocation = BaseSequence.Length + 2; - } - _allModsOneIsNterminus.Add(currentModificationLocation, mod); - currentlyReadingMod = false; - } - } - else if (!currentlyReadingMod) - { - currentModificationLocation++; - } - //else do nothing - } - } - private void GetProteinAfterDeserialization(Dictionary idToProtein) { Protein protein = null; diff --git a/mzLib/Readers/SearchResults/SpectrumMatchTsvReader.cs b/mzLib/Readers/SearchResults/SpectrumMatchTsvReader.cs index 62a720c63..709b391ba 100644 --- a/mzLib/Readers/SearchResults/SpectrumMatchTsvReader.cs +++ b/mzLib/Readers/SearchResults/SpectrumMatchTsvReader.cs @@ -28,7 +28,7 @@ public static List ReadTsv(string filePath, out List un = new Dictionary(); + var psiModDeserialized = 
Loaders.LoadPsiMod(Path.Combine(TestContext.CurrentContext.TestDirectory, "PSI-MOD.obo2.xml")); + Dictionary formalChargesDictionary = Loaders.GetFormalChargesDictionary(psiModDeserialized); + List UniProtPtms = Loaders.LoadUniprot(Path.Combine(TestContext.CurrentContext.TestDirectory, "ptmlist2.txt"), + formalChargesDictionary).ToList(); + List proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "cRAP_databaseGPTMD.xml"), + true, DecoyType.None, UniProtPtms, false, new string[] { "exclude_me" }, out un); + var allKnownModDict = UniProtPtms.ToDictionary(p => p.IdWithMotif, p => p); + var digestionParameters = new DigestionParams(maxModsForPeptides: 3); + + foreach (Protein p in proteins) + { + List digestedPeptides = + p.Digest(digestionParameters, [], [], null, null).ToList(); + // take the most modified peptide by base sequence and ensure all methods function properly + foreach (var targetPeptide in digestedPeptides + .Where(pep => pep.FullSequence.Contains('[')) + .GroupBy(pep => pep.BaseSequence) + .Select(pepGroup => pepGroup.MaxBy(pep => pep.AllModsOneIsNterminus.Count))) + { + var startResidue = targetPeptide.OneBasedStartResidue; + var endResidue = targetPeptide.OneBasedEndResidue; + + // Pull our expected modifications based upon parent protein object with a maximum value of DigestionParameters.MaxMods + // A bunch of logic to count the number of expected modifications based upon the xml database entries + int expectedModCount = 0; + foreach (var modDictEntry in p.OneBasedPossibleLocalizedModifications + .Where(mod => mod.Key >= startResidue && mod.Key <= endResidue)) + { + if (modDictEntry.Value.Count > 1) + { + var locRestrictions = modDictEntry.Value.Select(mod => mod.LocationRestriction).ToList(); + + if (locRestrictions.AllSame()) + { + if (locRestrictions.First() == "Anywhere.") + expectedModCount++; + else if (locRestrictions.First() == "N-terminal." 
&& modDictEntry.Key == startResidue) + expectedModCount++; + } + else if (modDictEntry.Value.Select(mod => mod.LocationRestriction).Contains("Anywhere.") + && modDictEntry.Value.Select(mod => mod.LocationRestriction) + .Contains("N-terminal.")) + { + expectedModCount++; + if (modDictEntry.Key == startResidue) + expectedModCount++; + } + } + else + { + switch (modDictEntry.Value.First().LocationRestriction) + { + case "Anywhere.": + case "N-terminal." when modDictEntry.Key == startResidue: + expectedModCount++; + break; + } + } + } + + expectedModCount = Math.Min(expectedModCount, digestionParameters.MaxMods); + + var expectedModifications = p.OneBasedPossibleLocalizedModifications.Where(mod => + mod.Key >= startResidue && + mod.Key <= endResidue).SelectMany(mod => mod.Value).ToList(); + + // Parse modifications from PWSM and two IBioPolymerWithSetMods methods + var pwsmModDict = targetPeptide.AllModsOneIsNterminus; + var bpwsmModDict = IBioPolymerWithSetMods.GetModificationDictionaryFromFullSequence(targetPeptide.FullSequence, allKnownModDict); + var bpwsmModList = IBioPolymerWithSetMods.GetModificationsFromFullSequence(targetPeptide.FullSequence, allKnownModDict); + + // Ensure all methods are in agreement by modification count + Assert.AreEqual(pwsmModDict.Count, expectedModCount); + Assert.AreEqual(bpwsmModDict.Count, expectedModCount); + Assert.AreEqual(bpwsmModList.Count, expectedModCount); + + // Ensure all methods are in agreement by modification identify + foreach (var pwsmModification in pwsmModDict.Values) + Assert.Contains(pwsmModification, expectedModifications); + foreach (var pwsmModification in bpwsmModDict.Values) + Assert.Contains(pwsmModification, expectedModifications); + foreach (var pwsmModification in bpwsmModList) + Assert.Contains(pwsmModification, expectedModifications); + } + } + } } } \ No newline at end of file