refactor: change stats to dataclass

wfondrie · Dec 13, 2024 · 6749f84 · 6749f84
1 parent 9841733
commit 6749f84
Show file tree

Hide file tree

Showing 3 changed files with 72 additions and 40 deletions.
diff --git a/mokapot/confidence.py b/mokapot/confidence.py
@@ -382,6 +382,7 @@ def assign_confidence(
 
     @typechecked
     def create_output_writer(path: Path, level: str, initialize: bool):
+        # Note: Despite its name, this function does not create a writer; it writes the data.
         if level == "proteins":
             output_columns = output_column_names_proteins
         else:

diff --git a/mokapot/statistics.py b/mokapot/statistics.py
@@ -3,54 +3,38 @@
 
 import numpy as np
 from typeguard import typechecked
+from dataclasses import dataclass
 
 SummaryStatistics = namedtuple(
     "SummaryStatistics", ("n", "min", "max", "sum", "mean", "var", "sd")
 )
 
 
 @typechecked
+@dataclass(slots=True)
 class OnlineStatistics:
-    """
-    @class Statistics:
-        A class for performing basic statistical calculations.
+    """A class for performing basic statistical calculations.
 
-    @attribute min:
+    Parameters
+    ----------
+    min : float
         The minimum value encountered so far. Initialized to positive infinity.
-
-    @attribute max:
+    max : float
         The maximum value encountered so far. Initialized to negative infinity.
-
-    @attribute n:
+    n : int
         The number of values encountered so far. Initialized to 0.
-
-    @attribute sum:
+    sum : float
         The sum of all values encountered so far. Initialized to 0.0.
-
-    @attribute mean:
-        The mean value calculated based on the encountered values. Initialized
-        to 0.0.
-
-    @attribute var:
-        The variance value calculated based on the encountered values.
+    mean : float
+        The mean value calculated based on the encountered values. Initialized to 0.0.
+    var : float
+        The variance value calculated based on the encountered values. Initialized to 0.0.
+    sd : float
+        The standard deviation value calculated based on the encountered values.
         Initialized to 0.0.
+    M2n : float
+        The intermediate value used in calculating variance. Initialized to 0.0.
 
-    @attribute sd:
-        The standard deviation value calculated based on the encountered
-        values. Initialized to 0.0.
-
-    @attribute M2n:
-        The intermediate value used in calculating variance. Initialized to
-        0.0.
-
-    @method update(vals: np.ndarray):
-        Updates the statistics with an array of values.
-
-    Args:
-        vals (np.ndarray): An array of values to update the statistics.
-
-    Returns:
-        None.
     """
 
     min: float = math.inf
@@ -61,6 +45,7 @@ class OnlineStatistics:
 
     M2n: float = 0.0
     ddof: float = 1.0
+    unbiased: bool = True
 
     @property
     def var(self) -> float:
@@ -70,13 +55,13 @@ def var(self) -> float:
     def sd(self) -> float:
         return math.sqrt(self.var)
 
-    def __init__(self, unbiased: bool = True):
-        if unbiased:
-            self.ddof = 1  # Use unbiased variance estimator
+    def __post_init__(self):
+        if self.unbiased:
+            # Use unbiased variance estimator
+            self.ddof = 1
         else:
-            self.ddof = (
-                0  # Use maximum likelihood (best L2) variance estimator
-            )
+            # Use maximum likelihood (best L2) variance estimator
+            self.ddof = 0
 
     def update(self, vals: np.ndarray) -> None:
         """

diff --git a/tests/unit_tests/test_writer_flashlfq.py b/tests/unit_tests/test_writer_flashlfq.py
@@ -18,9 +18,55 @@
 }
 
 
+def is_flashlfq_df(df):
+    """Check if the df is a valid FlashLFQ input.
+
+    https://github.com/smith-chem-wisc/FlashLFQ/wiki/Identification-Input-Formats
+
+    - File Name - With or without file extension (e.g. MyFile or MyFile.mzML)
+    - Base Sequence - Should only contain an amino acid sequence
+      (e.g., PEPTIDE and not PEPT[Phosphorylation]IDE)
+    - Full Sequence - Modified sequence. Can contain any characters
+      (e.g., PEPT[Phosphorylation]IDE is fine), but must be consistent between
+      the same peptidoform to get accurate results
+    - Peptide Monoisotopic Mass - Theoretical monoisotopic mass,
+      including modification mass
+    - Scan Retention Time - MS/MS identification scan retention time in minutes
+    - Precursor Charge - Charge of the ion selected for MS/MS resulting in the
+      identification. Use the number only (e.g., "3" and not "+3")
+    - Protein Accession - Protein accession(s) for the peptide.
+      It is important to list all of the parent protein options
+      if you want the "shared peptides" to be accurate.
+      Use the semicolon (;) to delimit different proteins.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        The table to validate. Column types are checked on the first row
+        only, so the table must contain at least one row.
+
+    Returns
+    -------
+    bool
+        Always ``True``; any violation raises ``AssertionError`` instead.
+
+    Raises
+    ------
+    AssertionError
+        If a required column is missing, the first value of a column has
+        the wrong type, or a base sequence contains anything other than
+        uppercase letters.
+    """
+    # Local import keeps this helper self-contained within the test module.
+    import numbers
+
+    # File Name	Scan Retention Time	Precursor Charge	Base Sequence
+    # Full Sequence	Peptide Monoisotopic Mass	Protein Accession
+    EXPECTED_COLS = {
+        "File Name": str,
+        "Base Sequence": str,
+        "Full Sequence": str,
+        "Peptide Monoisotopic Mass": float,
+        "Scan Retention Time": float,
+        # pandas hands back numpy scalars: numpy.float64 subclasses float,
+        # but numpy.int64 does NOT subclass int. numbers.Integral accepts
+        # both builtin ints and numpy integer scalars.
+        "Precursor Charge": numbers.Integral,
+        "Protein Accession": str,
+    }
+    for col, coltype in EXPECTED_COLS.items():
+        assert col in df.columns, f"Column {col} not found in input"
+        assert isinstance(
+            df[col].iloc[0], coltype
+        ), f"Column {col} is not {coltype}"
+
+    # Check that the *entire* base sequence matches [A-Z]+.
+    # ``str.match`` only anchors at the start, so it would accept
+    # modified sequences such as "PEPT[Phosphorylation]IDE".
+    assert (
+        df["Base Sequence"].str.fullmatch(r"[A-Z]+").all()
+    ), "Base sequence must only contain amino acids"
+
+    return True
+
+
 @pytest.fixture
 def flashlfq_psms_ds(psm_df_builder):
-    """A small OnDiskPsmDataset"""
+    """A small-ish PSM dataset"""
     data = psm_df_builder(1000, 1000, score_diffs=[5.0])
     psms = LinearPsmDataset(
         psms=data.df,