From 6749f84a4ca7d969198a68645b1aaa6af533ba89 Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Fri, 13 Dec 2024 14:02:15 -0600 Subject: [PATCH] refactor: change stats to dataclass --- mokapot/confidence.py | 1 + mokapot/statistics.py | 63 +++++++++--------------- tests/unit_tests/test_writer_flashlfq.py | 48 +++++++++++++++++- 3 files changed, 72 insertions(+), 40 deletions(-) diff --git a/mokapot/confidence.py b/mokapot/confidence.py index b91d88b..a59119f 100644 --- a/mokapot/confidence.py +++ b/mokapot/confidence.py @@ -382,6 +382,7 @@ def assign_confidence( @typechecked def create_output_writer(path: Path, level: str, initialize: bool): + # Note: This method does not create a writer, it writes the data. if level == "proteins": output_columns = output_column_names_proteins else: diff --git a/mokapot/statistics.py b/mokapot/statistics.py index 771bd66..42c17b3 100644 --- a/mokapot/statistics.py +++ b/mokapot/statistics.py @@ -3,6 +3,7 @@ import numpy as np from typeguard import typechecked +from dataclasses import dataclass SummaryStatistics = namedtuple( "SummaryStatistics", ("n", "min", "max", "sum", "mean", "var", "sd") @@ -10,47 +11,30 @@ @typechecked +@dataclass(slots=True) class OnlineStatistics: - """ - @class Statistics: - A class for performing basic statistical calculations. + """A class for performing basic statistical calculations. - @attribute min: + Parameters + ---------- + min : float The minimum value encountered so far. Initialized to positive infinity. - - @attribute max: + max : float The maximum value encountered so far. Initialized to negative infinity. - - @attribute n: + n : int The number of values encountered so far. Initialized to 0. - - @attribute sum: + sum : float The sum of all values encountered so far. Initialized to 0.0. - - @attribute mean: - The mean value calculated based on the encountered values. Initialized - to 0.0. - - @attribute var: - The variance value calculated based on the encountered values. 
+ mean : float + The mean value calculated based on the encountered values. Initialized to 0.0. + var : float + The variance value calculated based on the encountered values. Initialized to 0.0. + sd : float + The standard deviation value calculated based on the encountered values. Initialized to 0.0. + M2n : float + The intermediate value used in calculating variance. Initialized to 0.0. - @attribute sd: - The standard deviation value calculated based on the encountered - values. Initialized to 0.0. - - @attribute M2n: - The intermediate value used in calculating variance. Initialized to - 0.0. - - @method update(vals: np.ndarray): - Updates the statistics with an array of values. - - Args: - vals (np.ndarray): An array of values to update the statistics. - - Returns: - None. """ min: float = math.inf @@ -61,6 +45,7 @@ class OnlineStatistics: M2n: float = 0.0 ddof: float = 1.0 + unbiased: bool = True @property def var(self) -> float: @@ -70,13 +55,13 @@ def var(self) -> float: def sd(self) -> float: return math.sqrt(self.var) - def __init__(self, unbiased: bool = True): - if unbiased: - self.ddof = 1 # Use unbiased variance estimator + def __post_init__(self): + if self.unbiased: + # Use unbiased variance estimator + self.ddof = 1 else: - self.ddof = ( - 0 # Use maximum likelihood (best L2) variance estimator - ) + # Use maximum likelihood (best L2) variance estimator + self.ddof = 0 def update(self, vals: np.ndarray) -> None: """ diff --git a/tests/unit_tests/test_writer_flashlfq.py b/tests/unit_tests/test_writer_flashlfq.py index f8cc3fc..a417ba8 100644 --- a/tests/unit_tests/test_writer_flashlfq.py +++ b/tests/unit_tests/test_writer_flashlfq.py @@ -18,9 +18,55 @@ } +def is_flashlfq_df(df): + """Check if the df is a valid FlashLFQ input. + + https://github.com/smith-chem-wisc/FlashLFQ/wiki/Identification-Input-Formats + + - File Name - With or without file extension (e.g. 
MyFile or MyFile.mzML) + - Base Sequence - Should only contain an amino acid sequence + (e.g., PEPTIDE and not PEPT[Phosphorylation]IDE) + - Full Sequence - Modified sequence. Can contain any characters + (e.g., PEPT[Phosphorylation]IDE is fine), but must be consistent between + the same peptidoform to get accurate results + - Peptide Monoisotopic Mass - Theoretical monoisotopic mass, + including modification mass + - Scan Retention Time - MS/MS identification scan retention time in minutes + - Precursor Charge - Charge of the ion selected for MS/MS resulting in the + identification. Use the number only (e.g., "3" and not "+3") + - Protein Accession - Protein accession(s) for the peptide. + It is important to list all of the parent protein options + if you want the "shared peptides" to be accurate. + Use the semicolon (;) to delimit different proteins. + """ + # File Name Scan Retention Time Precursor Charge Base Sequence + # Full Sequence Peptide Monoisotopic Mass Protein Accession + EXPECTED_COLS = { + "File Name": str, + "Base Sequence": str, + "Full Sequence": str, + "Peptide Monoisotopic Mass": float, + "Scan Retention Time": float, + "Precursor Charge": int, + "Protein Accession": str, + } + for col, coltype in EXPECTED_COLS.items(): + assert col in df.columns, f"Column {col} not found in input" + assert isinstance( + df[col].iloc[0], coltype + ), f"Column {col} is not {coltype}" + + # Check that the base sequence matches the pattern [A-Z]+ + assert ( + df["Base Sequence"].str.match("[A-Z]+").all() + ), "Base sequence must only contain amino acids" + + return True + + @pytest.fixture def flashlfq_psms_ds(psm_df_builder): - """A small OnDiskPsmDataset""" + """A small-ish PSM dataset""" data = psm_df_builder(1000, 1000, score_diffs=[5.0]) psms = LinearPsmDataset( psms=data.df,