Skip to content

Commit

Permalink
refactor: change stats to dataclass
Browse files Browse the repository at this point in the history
  • Loading branch information
jspaezp committed Dec 13, 2024
1 parent 9841733 commit 6749f84
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 40 deletions.
1 change: 1 addition & 0 deletions mokapot/confidence.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,7 @@ def assign_confidence(

@typechecked
def create_output_writer(path: Path, level: str, initialize: bool):
# Note: Despite its name, this function does not create a writer; it writes the data.
if level == "proteins":
output_columns = output_column_names_proteins
else:
Expand Down
63 changes: 24 additions & 39 deletions mokapot/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,54 +3,38 @@

import numpy as np
from typeguard import typechecked
from dataclasses import dataclass

SummaryStatistics = namedtuple(
"SummaryStatistics", ("n", "min", "max", "sum", "mean", "var", "sd")
)


@typechecked
@dataclass(slots=True)
class OnlineStatistics:
"""
@class Statistics:
A class for performing basic statistical calculations.
"""A class for performing basic statistical calculations.
@attribute min:
Parameters
----------
min : float
The minimum value encountered so far. Initialized to positive infinity.
@attribute max:
max : float
The maximum value encountered so far. Initialized to negative infinity.
@attribute n:
n : int
The number of values encountered so far. Initialized to 0.
@attribute sum:
sum : float
The sum of all values encountered so far. Initialized to 0.0.
@attribute mean:
The mean value calculated based on the encountered values. Initialized
to 0.0.
@attribute var:
The variance value calculated based on the encountered values.
mean : float
The mean value calculated based on the encountered values. Initialized to 0.0.

Check failure on line 29 in mokapot/statistics.py

View workflow job for this annotation

GitHub Actions / lint

Ruff (E501)

mokapot/statistics.py:29:80: E501 Line too long (86 > 79)
var : float
The variance value calculated based on the encountered values. Initialized to 0.0.

Check failure on line 31 in mokapot/statistics.py

View workflow job for this annotation

GitHub Actions / lint

Ruff (E501)

mokapot/statistics.py:31:80: E501 Line too long (90 > 79)
sd : float
The standard deviation value calculated based on the encountered values.

Check failure on line 33 in mokapot/statistics.py

View workflow job for this annotation

GitHub Actions / lint

Ruff (E501)

mokapot/statistics.py:33:80: E501 Line too long (80 > 79)
Initialized to 0.0.
M2n : float
The intermediate value used in calculating variance. Initialized to 0.0.

Check failure on line 36 in mokapot/statistics.py

View workflow job for this annotation

GitHub Actions / lint

Ruff (E501)

mokapot/statistics.py:36:80: E501 Line too long (80 > 79)
@attribute sd:
The standard deviation value calculated based on the encountered
values. Initialized to 0.0.
@attribute M2n:
The intermediate value used in calculating variance. Initialized to
0.0.
@method update(vals: np.ndarray):
Updates the statistics with an array of values.
Args:
vals (np.ndarray): An array of values to update the statistics.
Returns:
None.
"""

min: float = math.inf
Expand All @@ -61,6 +45,7 @@ class OnlineStatistics:

M2n: float = 0.0
ddof: float = 1.0
unbiased: bool = True

@property
def var(self) -> float:
Expand All @@ -70,13 +55,13 @@ def var(self) -> float:
def sd(self) -> float:
return math.sqrt(self.var)

def __init__(self, unbiased: bool = True):
if unbiased:
self.ddof = 1 # Use unbiased variance estimator
def __post_init__(self):
if self.unbiased:
# Use unbiased variance estimator
self.ddof = 1
else:
self.ddof = (
0 # Use maximum likelihood (best L2) variance estimator
)
# Use maximum likelihood (best L2) variance estimator
self.ddof = 0

def update(self, vals: np.ndarray) -> None:
"""
Expand Down
48 changes: 47 additions & 1 deletion tests/unit_tests/test_writer_flashlfq.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,55 @@
}


def is_flashlfq_df(df):
    """Check if the df is a valid FlashLFQ input.

    https://github.com/smith-chem-wisc/FlashLFQ/wiki/Identification-Input-Formats

    - File Name - With or without file extension (e.g. MyFile or MyFile.mzML)
    - Base Sequence - Should only contain an amino acid sequence
        (e.g., PEPTIDE and not PEPT[Phosphorylation]IDE)
    - Full Sequence - Modified sequence. Can contain any characters
        (e.g., PEPT[Phosphorylation]IDE is fine), but must be consistent
        between the same peptidoform to get accurate results
    - Peptide Monoisotopic Mass - Theoretical monoisotopic mass,
        including modification mass
    - Scan Retention Time - MS/MS identification scan retention time in
        minutes
    - Precursor Charge - Charge of the ion selected for MS/MS resulting in
        the identification. Use the number only (e.g., "3" and not "+3")
    - Protein Accession - Protein accession(s) for the peptide.
        It is important to list all of the parent protein options
        if you want the "shared peptides" to be accurate.
        Use the semicolon (;) to delimit different proteins.

    Parameters
    ----------
    df : pandas.DataFrame
        The table to validate. Only the first row is inspected for the
        per-column type checks; the base-sequence pattern is checked on
        every row.

    Returns
    -------
    bool
        Always ``True``; every failure mode raises instead of returning
        ``False``, so the result can be used directly in an ``assert``.

    Raises
    ------
    AssertionError
        If a required column is missing, a column has no rows, the first
        value of a column has an unexpected type, or any base sequence
        contains characters other than ``A-Z``.
    """
    # Imported locally so the helper does not depend on the module's
    # top-level imports.
    import numbers

    # File Name Scan Retention Time Precursor Charge Base Sequence
    # Full Sequence Peptide Monoisotopic Mass Protein Accession
    EXPECTED_COLS = {
        "File Name": str,
        "Base Sequence": str,
        "Full Sequence": str,
        "Peptide Monoisotopic Mass": float,
        "Scan Retention Time": float,
        # numbers.Integral rather than int: integer columns hand back
        # numpy integer scalars, which are not subclasses of the builtin
        # int but are registered as Integral.
        "Precursor Charge": numbers.Integral,
        "Protein Accession": str,
    }
    for col, coltype in EXPECTED_COLS.items():
        assert col in df.columns, f"Column {col} not found in input"
        values = df[col]
        # Fail with a readable message instead of an IndexError from iloc.
        assert len(values) > 0, f"Column {col} has no rows to check"
        assert isinstance(
            values.iloc[0], coltype
        ), f"Column {col} is not {coltype}"

    # Check that the base sequence matches the pattern [A-Z]+ in full.
    # str.match only anchors at the start of the string, so it would let
    # "PEPT[Phosphorylation]IDE" through; fullmatch requires the whole
    # value to match.
    assert (
        df["Base Sequence"].str.fullmatch(r"[A-Z]+").all()
    ), "Base sequence must only contain amino acids"

    return True


@pytest.fixture
def flashlfq_psms_ds(psm_df_builder):
"""A small OnDiskPsmDataset"""
"""A small-ish PSM dataset"""
data = psm_df_builder(1000, 1000, score_diffs=[5.0])
psms = LinearPsmDataset(
psms=data.df,
Expand Down

0 comments on commit 6749f84

Please sign in to comment.