-
Notifications
You must be signed in to change notification settings - Fork 33
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Added in ability to parse TopPIC search result files * Added ability to write entire header * Added comments * Excluded Test class from code coverage * Adjusted some fields to match ours better * Handled a few edge cases * Added support for TopPIC version 1.5.3 * Added support for TopPIC version 1.5.3 * Fixed up operators --------- Co-authored-by: Nic Bollis <nbollis@wisc.edu> Co-authored-by: trishorts <mshort@chem.wisc.edu>
- Loading branch information
1 parent
6c77ca0
commit 353ae25
Showing
16 changed files
with
1,252 additions
and
4 deletions.
There are no files selected for viewing
208 changes: 208 additions & 0 deletions
208
mzLib/Readers/ExternalResults/IndividualResultRecords/ToppicPrsm.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,208 @@ | ||
using System.Globalization; | ||
using System.Text; | ||
using System.Text.RegularExpressions; | ||
using CsvHelper.Configuration; | ||
using CsvHelper.Configuration.Attributes; | ||
using MassSpectrometry; | ||
using static System.Net.Mime.MediaTypeNames; | ||
|
||
namespace Readers; | ||
|
||
/// <summary>
/// Class representing a single TopPIC PrSM or proteoform record, i.e. one row of a
/// tab-delimited TopPIC result file.
/// For supported versions and software this file type can come from see
/// Readers.ExternalResources.SupportedVersions.txt
/// </summary>
/// <remarks>
/// Things that could be done to improve compatibility:
///     Convert Variable Modifications to a list of Modification objects
///     Convert NTerminalForm to a Modification object
/// Properties are bound to file columns through their <c>[Name]</c> attribute; members marked
/// <c>[Ignore]</c> are not read from or written to the file, and members marked
/// <c>[Optional]</c> may be absent from the header.
/// </remarks>
public class ToppicPrsm
{
    /// <summary>
    /// CsvHelper configuration for these records: UTF-8, tab-delimited, with a header record.
    /// A new configuration instance is created on every access.
    /// </summary>
    [Ignore]
    public static CsvConfiguration CsvConfiguration => new CsvConfiguration(CultureInfo.InvariantCulture)
    {
        Encoding = Encoding.UTF8,
        HasHeaderRecord = true,
        Delimiter = "\t",
    };

    // Patterns used by GetBaseSequenceFromFullSequence. They are built once and shared
    // instead of being handed to the static Regex API as strings for every record.

    /// <summary>Matches a square-bracketed annotation, brackets included.</summary>
    private static readonly Regex BracketedTextRegex = new(@"\[[^\]]*\]", RegexOptions.Compiled);

    /// <summary>Matches a single opening or closing parenthesis.</summary>
    private static readonly Regex ParenthesisRegex = new(@"[()]", RegexOptions.Compiled);

    /// <summary>
    /// Matches either the leading run of non-period characters at the start of the string,
    /// or the final period together with the non-period characters that follow it.
    /// </summary>
    private static readonly Regex FlankingResidueRegex = new(@"(^[^.]+)|(\.[^.]+$)", RegexOptions.Compiled);

    /// <summary>
    /// Creates an empty record with an empty <see cref="AlternativeIdentifications"/> list.
    /// </summary>
    public ToppicPrsm()
    {
        AlternativeIdentifications = new List<AlternativeToppicId>();
    }

    private string? _fileNameWithoutExtension;

    /// <summary>
    /// <see cref="FilePath"/> without directory or extension. Computed on first access and
    /// cached, so a later change to <see cref="FilePath"/> is not reflected once a non-null
    /// value has been cached.
    /// </summary>
    [Ignore]
    public string FileNameWithoutExtension => _fileNameWithoutExtension ??= Path.GetFileNameWithoutExtension(FilePath);

    [Name("Data file name")]
    public string FilePath { get; set; }

    [Name("Prsm ID")]
    public int PrsmID { get; set; }

    [Name("Spectrum ID")]
    public int SpectrumId { get; set; }

    [Name("Fragmentation")]
    public DissociationType DissociationType { get; set; }

    [Name("Scan(s)")]
    public int OneBasedScanNumber { get; set; }

    [Name("Retention time")]
    public double RetentionTime { get; set; }

    [Name("#peaks")]
    public int PeakCount { get; set; }

    [Name("Charge")]
    public int PrecursorCharge { get; set; }

    [Name("Precursor mass")]
    public double PrecursorMass { get; set; }

    [Name("Adjusted precursor mass")]
    public double AdjustedPrecursorMass { get; set; }

    [Name("Proteoform ID")]
    public double ProteoformId { get; set; }

    [Name("Feature intensity")]
    [Format("#.00#E+00")]
    public double FeatureIntensity { get; set; }

    [Name("Feature score")]
    public double FeatureScore { get; set; }

    [Name("Feature apex time")]
    public double FeatureApexTime { get; set; }

    [Name("#Protein hits")]
    public int ProteinHitsCount { get; set; }

    [Name("Protein accession")]
    public string ProteinAccession { get; set; }

    [Name("Protein description")]
    public string ProteinDescription { get; set; }

    [Name("First residue")]
    public int FirstResidue { get; set; }

    [Name("Last residue")]
    public int LastResidue { get; set; }

    [Name("Special amino acids")]
    public string? SpecialAminoAcids { get; set; }

    [Ignore]
    private string? _baseSequence;

    /// <summary>
    /// Value of the optional "Database protein sequence" column. When no value has been
    /// assigned, the getter derives one from <see cref="FullSequence"/> via
    /// <see cref="GetBaseSequenceFromFullSequence"/> and caches the result.
    /// </summary>
    [Optional]
    [Name("Database protein sequence")]
    public string BaseSequence
    {
        get => _baseSequence ??= GetBaseSequenceFromFullSequence();
        set => _baseSequence = value;
    }

    [Name("Proteoform")]
    public string FullSequence { get; set; }

    [Name("Proteoform mass")]
    public double FullSequenceMass { get; set; }

    [Name("Protein N-terminal form")]
    public string ProteinNTerminalForm { get; set; }

    [Optional]
    [Name("Fixed PTMs")]
    public string? FixedPTMs { get; set; }

    [Name("#unexpected modifications")]
    public int UnexpectedModificationsCount { get; set; }

    /// <summary>
    /// The mass shift of the mod and its semi-localization
    /// -47:[10-14] means a mass shift of -47 Da, and the semi-localization is between the 10th and 14th amino acids
    /// </summary>
    [Optional]
    [Name("unexpected modifications")]
    public string UnexpectedModifications { get; set; }

    [Name("#variable PTMs")]
    public int VariableModificationsCount { get; set; }

    [Optional]
    [Name("variable PTMs")]
    public string VariableModifications { get; set; }

    // NOTE(review): DashToNullOrDoubleConverter is defined outside this file; going by its name
    // it presumably maps a "-" placeholder to null and anything else to a double - confirm.
    [Name("MIScore")]
    [TypeConverter(typeof(DashToNullOrDoubleConverter))]
    public double? MIScore { get; set; }

    [Name("#matched peaks")]
    public int MatchedPeaksCount { get; set; }

    [Name("#matched fragment ions")]
    public int MatchedFragmentIonsCount { get; set; }

    [Name("E-value")]
    [Format("0.00E+00")]
    public double EValue { get; set; }

    [Name("Spectrum-level Q-value")]
    [TypeConverter(typeof(DashToNullOrDoubleConverter))]
    public double? QValueSpectrumLevel { get; set; }

    [Name("Proteoform-level Q-value")]
    [TypeConverter(typeof(DashToNullOrDoubleConverter))]
    public double? QValueProteoformLevel { get; set; }

    /// <summary>
    /// Alternative identifications attached to this record. Not bound to any column;
    /// starts out as an empty list.
    /// </summary>
    [Ignore]
    public List<AlternativeToppicId> AlternativeIdentifications { get; set; }

    /// <summary>
    /// Derives the bare residue string from <see cref="FullSequence"/>.
    /// </summary>
    /// <remarks>
    /// Steps, in order:
    ///     1. drop every square-bracketed annotation,
    ///     2. drop parentheses (keeping what they enclose),
    ///     3. drop the text before the first period and the text after the last period,
    ///     4. drop the remaining periods.
    /// For example <c>M.A(B)[Acetyl]C.D</c> becomes <c>ABC</c>.
    /// A sequence that contains no period at all yields an empty string, because step 3
    /// then consumes the whole input.
    /// </remarks>
    /// <returns>The sequence with annotations, flanking residues and periods removed.</returns>
    /// <exception cref="ArgumentNullException"><see cref="FullSequence"/> is null.</exception>
    public string GetBaseSequenceFromFullSequence()
    {
        // Remove text within square brackets
        var sequence = BracketedTextRegex.Replace(FullSequence, "");

        // Remove parentheses
        sequence = ParenthesisRegex.Replace(sequence, "");

        // Remove the flanking text outside the first/last period, then the periods themselves
        sequence = FlankingResidueRegex.Replace(sequence, "");
        return sequence.Replace(".", "");
    }
}
|
||
/// <summary>
/// An alternative identification for a PrSM, as listed in the tsv file:
/// the PrSM it belongs to plus the protein accession, description and residue range.
/// </summary>
public class AlternativeToppicId
{
    /// <summary>
    /// Creates an alternative identification from its five values.
    /// </summary>
    /// <param name="prsmId">Stored in <see cref="PrsmId"/>.</param>
    /// <param name="accession">Stored in <see cref="Accession"/>.</param>
    /// <param name="proteinDescription">Stored in <see cref="ProteinDescription"/>.</param>
    /// <param name="firstResidue">Stored in <see cref="FirstResidue"/>.</param>
    /// <param name="lastResidue">Stored in <see cref="LastResidue"/>.</param>
    public AlternativeToppicId(int prsmId, string accession, string proteinDescription, int firstResidue,
        int lastResidue)
    {
        (PrsmId, Accession, ProteinDescription) = (prsmId, accession, proteinDescription);
        (FirstResidue, LastResidue) = (firstResidue, lastResidue);
    }

    public int PrsmId { get; set; }

    public string Accession { get; set; }

    public string ProteinDescription { get; set; }

    public int FirstResidue { get; set; }

    public int LastResidue { get; set; }

    /// <summary>
    /// Formats this identification as one tab-separated row: the PrSM id, a run of empty
    /// fields, then accession, description, first residue and last residue, followed by
    /// trailing empty fields.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the number of empty fields presumably lines the values up with the
    /// columns of the main result rows - confirm against the file writer.
    /// </remarks>
    public override string ToString() =>
        $"{PrsmId}\t\t\t\t\t\t\t\t\t\t\t\t\t\t{Accession}\t{ProteinDescription}\t{FirstResidue}\t{LastResidue}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t";
}
Oops, something went wrong.