diff --git a/mokapot/__init__.py b/mokapot/__init__.py
index 1c747103..4c12e895 100644
--- a/mokapot/__init__.py
+++ b/mokapot/__init__.py
@@ -2,6 +2,7 @@
 from .confidence import PsmConfidence
 from .dataset import PsmDataset
 from .parsers.fasta import digest, make_decoys, read_fasta
+from .parsers.pin import percolator_to_df, read_pin
 from .schema import PsmSchema
 from .version import _get_version
 from .writers import to_csv, to_flashlfq, to_parquet, to_txt
diff --git a/mokapot/parsers/pin.py b/mokapot/parsers/pin.py
index 11b3d0ff..c34ab523 100644
--- a/mokapot/parsers/pin.py
+++ b/mokapot/parsers/pin.py
@@ -1,183 +1,149 @@
-"""
-This module contains the parsers for reading in PSMs
-"""
+"""Parsers for reading in PSMs."""
+import gzip
 import logging
+from io import StringIO
+from os import PathLike
+from typing import IO, Iterable
 
-import pandas as pd
+import numpy as np
 import polars as pl
 
 from .. import utils
-from ..dataset import LinearPsmDataset
+from ..dataset import PsmDataset
+from ..proteins import Proteins
+from ..schema import PsmSchema
 
 LOGGER = logging.getLogger(__name__)
 
 
 def read_pin(
-    pin_files: str | tuple[str] | pl.DataFrame | pd.DataFrame,
-    group_column: str = None,
-    filename_column: str = None,
-    calcmass_column: str = None,
-    expmass_column: str = None,
-    rt_column: str = None,
-    charge_column: str = None,
-    to_df: bool = False,
-) -> LinearPsmDataset | pl.LazyFrame:
+    pin_files: PathLike | pl.DataFrame | Iterable[PathLike | pl.DataFrame],
+    group: str = None,
+    file: str = None,
+    calcmass: str = None,
+    expmass: str = None,
+    ret_time: str = None,
+    charge: str = None,
+    proteins: Proteins | None = None,
+    eval_fdr: float = 0.01,
+    subset: int | None = None,
+    rng: int | np.random.Generator | None = None,
+    strict_parsing: bool = False,
+) -> PsmDataset:
     """Read Percolator input (PIN) tab-delimited files.
 
     Read PSMs from one or more Percolator input (PIN) tab-delimited files,
     aggregating them into a single
-    :py:class:`~mokapot.dataset.LinearPsmDataset`. For more details about the
+    :py:class:`~mokapot.dataset.PsmDataset`. For more details about the
     PIN file format, see the `Percolator documentation
     <https://github.com/percolator/percolator/wiki/Interface>`_.
+    This function also works on Parquet versions of PIN files.
 
-    Specifically, mokapot requires specific columns in the tab-delimited
-    files: `specid`, `scannr`, `peptide`, and `label`. Note that these
-    column names are case insensitive. In addition to these special columns
-    defined for the PIN format, mokapot also looks for additional columns that
-    specify the MS data file names, theoretical monoisotopic peptide masses,
-    the measured mass, retention times, and charge states, which are necessary
-    to create specific output formats for downstream tools, such as FlashLFQ.
+    Specifically, mokapot requires the following columns in the PIN files:
+    `specid`, `scannr`, `peptide`, and `label`. Note that these column names
+    are case-insensitive. In addition to these special columns that are
+    defined for the PIN format, mokapot also looks for additional columns
+    that specify the MS data file names, theoretical monoisotopic peptide
+    masses, the measured mass, retention times, and charge states, which are
+    necessary to create specific output formats for downstream tools, such
+    as FlashLFQ.
 
     In addition to PIN tab-delimited files, the `pin_files` argument can be a
-    :py:class:`pl.DataFrame` :py:class:`pandas.DataFrame` containing the above
-    columns.
+    :py:class:`polars.DataFrame` or :py:class:`pandas.DataFrame` containing
+    the above columns.
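+
+    For reference, a PIN header might look like the following, where the
+    columns are tab-separated and ``feature1`` through ``featureN`` stand in
+    for arbitrary, search-engine-specific feature columns::
+
+        SpecId  Label  ScanNr  feature1  ...  featureN  Peptide  Proteins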
 
     Finally, mokapot does not currently support specifying a default
     direction or feature weights in the PIN file itself. If these are
     present, they will be ignored.
 
+    Warning
+    -------
+    The final column in the PIN format is a tab-delimited list of proteins.
+    For efficiency, and because mokapot does not use this column for protein
+    inference, the default behavior is to truncate this column to the first
+    protein in each row. If this is not desired, use the `strict_parsing`
+    parameter. Note that parsing data in this manner will not allow for the
+    memory-efficient data streaming that is normally used.
+
     Parameters
     ----------
-    pin_files : str, tuple of str, polars.DataFrame, or pandas.DataFrame
+    pin_files : PathLike, polars.DataFrame, or an iterable of either
         One or more PIN files or data frames to be read. Multiple files can
         be specified using globs, such as ``"psms_*.pin"``, or separately.
-    group_column : str, optional
+    group : str, optional
         A factor by which to group PSMs for grouped confidence estimation.
-    filename_column : str, optional
+    file : str, optional
         The column specifying the MS data file. If :code:`None`, mokapot will
         look for a column called "filename" (case insensitive). This is
         required for some output formats, such as FlashLFQ.
-    calcmass_column : str, optional
-        The column specifying the theoretical monoisotopic mass of the peptide
-        including modifications. If :code:`None`, mokapot will look for a
-        column called "calcmass" (case insensitive). This is required for some
-        output formats, such as FlashLFQ.
-    expmass_column : str, optional
-        The column specifying the measured neutral precursor mass. If
-        :code:`None`, mokapot will look for a column call "expmass" (case
-        insensitive). This is required for some output formats.
-    rt_column : str, optional
+    calcmass : str, optional
+        The column specifying the theoretical monoisotopic mass of the
+        peptide, including modifications. If :code:`None`, mokapot will look
+        for a column called "calcmass" (case insensitive). This is required
+        for some output formats, such as FlashLFQ.
+    expmass : str, optional
+        The column specifying the measured neutral precursor mass. If
+        :code:`None`, mokapot will look for a column called "expmass" (case
+        insensitive). This is required for some output formats.
+    ret_time : str, optional
         The column specifying the retention time in seconds. If :code:`None`,
         mokapot will look for a column called "ret_time" (case insensitive).
         This is required for some output formats, such as FlashLFQ.
-    charge_column : str, optional
+    charge : str, optional
         The column specifying the charge state of each peptide. If
         :code:`None`, mokapot will look for a column called "charge" (case
         insensitive). This is required for some output formats, such as
         FlashLFQ.
-    to_df : bool, optional
-        Return a :py:class:`pandas.DataFrame` instead of a
-        :py:class:`~mokapot.dataset.LinearPsmDataset`.
+    proteins : mokapot.Proteins, optional
+        The proteins to use for protein-level confidence estimation. This
+        may be created with :py:func:`mokapot.read_fasta()`.
+    eval_fdr : float, optional
+        The false discovery rate threshold for choosing the best feature and
+        creating positive labels during the training procedure.
+    subset : int, optional
+        The maximum number of examples to use for training.
+    rng : int or np.random.Generator, optional
+        A seed or generator used for cross-validation split creation and to
+        break ties, or :code:`None` to use the default random number
+        generator state.
+    strict_parsing : bool, optional
+        If `True`, the fast, memory-efficient parser is replaced by a
+        slower, less efficient parser that correctly captures the protein
+        column of the PIN file. This is generally not recommended.
 
     Returns
     -------
-    LinearPsmDataset or polars.LazyFrame
-        A :py:class:`~mokapot.dataset.LinearPsmDataset` object containing the
-        PSMs from all of the PIN files.
+    PsmDataset
+        The PSMs from all of the PIN files.
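+
+    Examples
+    --------
+    A minimal sketch of typical usage; the file names and the group column
+    here are hypothetical:
+
+    >>> import mokapot
+    >>> psms = mokapot.read_pin("psms.pin")  # doctest: +SKIP
+    >>> grouped = mokapot.read_pin(
+    ...     ["run1.pin", "run2.pin"], group="batch"
+    ... )  # doctest: +SKIP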
     """
     logging.info("Parsing PSMs...")
-
-    # Figure out the type of the input...
-    if isinstance(pin_files, pd.DataFrame):
-        pin_df = pl.from_pandas(pin_files).lazy()
-    elif isinstance(pin_files, pl.DataFrame):
-        pin_df = pin_files.lazy()
-    elif isinstance(pin_files, pl.LazyFrame):
-        pin_df = pin_files
-    else:
-        [pl.scan_csv(f, sep="\t") for f in utils.tuplize(pin_files)]
-        pin_df = pl.concat(pin_files, how="diagonal")
-
-    # Find all of the necessary columns, case-insensitive:
-    specid = [c for c in pin_df.columns if c.lower() == "specid"]
-    peptides = [c for c in pin_df.columns if c.lower() == "peptide"][0]
-    labels = [c for c in pin_df.columns if c.lower() == "label"][0]
-    scan = [c for c in pin_df.columns if c.lower() == "scannr"][0]
-
-    # Optional columns
-    filename = _check_column(filename_column, pin_df, "filename")
-    calcmass = _check_column(calcmass_column, pin_df, "calcmass")
-    expmass = _check_column(expmass_column, pin_df, "expmass")
-    ret_time = _check_column(rt_column, pin_df, "ret_time")
-    charge = _check_column(charge_column, pin_df, "charge_column")
-    spectra = [c for c in [filename, scan, ret_time, expmass] if c is not None]
-
-    try:
-        proteins = [c for c in pin_df.columns if c.lower() == "proteins"][0]
-    except IndexError:
-        proteins = None
-
-    nonfeat = [*specid, scan, peptides, proteins, labels]
-
-    # Only add charge to features if there aren't other charge columns:
-    alt_charge = [c for c in pin_df.columns if c.lower().startswith("charge")]
-    if charge is not None and len(alt_charge) > 1:
-        nonfeat.append(charge)
-
-    # Add the grouping column
-    if group_column is not None:
-        nonfeat += [group_column]
-        if group_column not in pin_df.columns:
-            raise ValueError(f"The '{group_column} column was not found.")
-
-    for col in [filename, calcmass, expmass, ret_time]:
-        if col is not None:
-            nonfeat.append(col)
-
-    features = [c for c in pin_df.columns if c not in nonfeat]
-
-    # Check for errors:
-    if not all([specid, peptides, labels, spectra]):
-        raise ValueError(
-            "This PIN format is incompatible with mokapot. Please"
-            " verify that the required columns are present."
-        )
-
-    # Convert labels to the correct format.
-    pin_df = pin_df.with_columns(
-        pl.when(pl.col("Label") == 1)
-        .then(True)
-        .otherwise(False)
-        .alias("Label")
+    data = build_df(pin_files)
+    prot_col = find_column(None, data, "proteins", False)
+
+    schema = PsmSchema(
+        target=find_column(None, data, "label", True),
+        spectrum=[
+            find_column(None, data, "specid", True),
+            find_column(None, data, "scannr", True),
+        ],
+        peptide=find_column(None, data, "peptide", True),
+        file=find_column(file, data, "filename", False),
+        expmass=find_column(expmass, data, "expmass", False),
+        calcmass=find_column(calcmass, data, "calcmass", False),
+        ret_time=find_column(ret_time, data, "ret_time", False),
+        charge=find_column(charge, data, "charge", False),
+        group=find_column(group, data, None, False),
+        metadata=prot_col,
     )
 
-    if to_df:
-        return pin_df
-
-    return LinearPsmDataset(
-        psms=pin_df,
-        target_column=labels,
-        spectrum_columns=spectra,
-        peptide_column=peptides,
-        protein_column=proteins,
-        group_column=group_column,
-        feature_columns=features,
-        filename_column=filename,
-        scan_column=scan,
-        calcmass_column=calcmass,
-        expmass_column=expmass,
-        rt_column=ret_time,
-        charge_column=charge,
-        copy_data=False,
+    if schema.expmass is not None:
+        schema.spectrum.append(schema.expmass)
+
+    return PsmDataset(
+        data=data,
+        schema=schema,
+        eval_fdr=eval_fdr,
+        proteins=proteins,
+        subset=subset,
+        rng=rng,
     )
 
 
-def read_percolator(perc_file):
-    """
-    Read a Percolator tab-delimited file.
+def percolator_to_df(perc_file: PathLike) -> pl.DataFrame:
+    """Read a Percolator tab-delimited file.
 
     Percolator input format (PIN) files and the Percolator result files
     are tab-delimited, but also have a tab-delimited protein list as the
@@ -185,7 +151,7 @@ def read_percolator(perc_file):
 
     Parameters
     ----------
-    perc_file : str
+    perc_file : PathLike
         The file to parse.
 
     Returns
@@ -206,14 +172,16 @@ def read_percolator(perc_file):
         perc.seek(0)
         _ = perc.readline()
 
-    psms = pd.concat((c for c in _parse_in_chunks(perc, cols)), copy=False)
+    psms = pl.concat(_parse_in_chunks(perc, cols), how="vertical")
 
     return psms
 
 
-def _parse_in_chunks(file_obj, columns, chunk_size=int(1e8)):
+def _parse_in_chunks(
+    file_obj: IO, columns: list[str], chunk_size: int = int(1e8)
+) -> Iterable[pl.DataFrame]:
     """
-    Parse a file in chunks
+    Parse a file in chunks.
 
     Parameters
     ----------
@@ -234,20 +202,88 @@ def _parse_in_chunks(file_obj, columns, chunk_size=int(1e8)):
         if not psms:
            break
 
-        psms = [p.rstrip().split("\t", len(columns) - 1) for p in psms]
-        psms = pd.DataFrame.from_records(psms, columns=columns)
-        yield psms.apply(pd.to_numeric, errors="ignore")
+        psms = [
+            ",".join(p.rstrip().split("\t", len(columns) - 1)) for p in psms
+        ]
+        psms = [",".join(columns)] + psms
 
-def _check_column(col, df, default):
-    """Check that a column exists in the dataframe."""
+        yield pl.read_csv(StringIO("\n".join(psms)))
+
+
+def find_column(
+    col: str | None,
+    df: pl.LazyFrame,
+    default: str | None,
+    required: bool,
+) -> str | None:
+    """Check that a column exists in the dataframe.
+
+    Parameters
+    ----------
+    col : str or None
+        The specified column that should be in the dataframe.
+    df : polars.LazyFrame
+        The dataframe.
+    default : str or None
+        A case-insensitive fall-back option for the column.
+    required : bool
+        Is this column required?
+
+    Returns
+    -------
+    str or None
+        The resolved column name, or :code:`None` if an optional column
+        could not be found.
+    """
     if col is None:
         try:
             return [c for c in df.columns if c.lower() == default][0]
-        except IndexError:
-            return None
+        except (IndexError, TypeError):
+            if not required:
+                return None
+            raise ValueError(f"The '{default}' column was not found.")
 
-    if col not in df.columns:
+    if col not in df.columns and required:
         raise ValueError(f"The '{col}' column was not found.")
 
     return col
+
+
+def build_df(
+    pin_files: str | pl.DataFrame | Iterable[str | pl.DataFrame],
+) -> pl.LazyFrame:
+    """Build the PIN DataFrame.
+
+    Parameters
+    ----------
+    pin_files : str, polars.DataFrame, or iterable of either
+        One or more PIN files or data frames to be read.
+
+    Returns
+    -------
+    polars.LazyFrame
+        The parsed data.
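+
+    Examples
+    --------
+    A sketch of the accepted inputs; the file names here are hypothetical:
+
+    >>> lazy = build_df(["run1.pin", "run2.pin"])  # doctest: +SKIP
+    >>> lazy = build_df(pl.DataFrame({"SpecId": ["a"]}))  # doctest: +SKIP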
+ """ + dfs = [] + for pin_file in utils.listify(pin_files): + # Figure out the type of the input + try: + pin_file = pl.from_pandas(pin_file) + except TypeError: + pass + + try: + dfs.append(pin_file.lazy()) + except AttributeError: + try: + dfs.append(pl.scan_parquet(pin_file)) + except pl.ArrowError: + dfs.append( + pl.scan_csv( + pin_file, + separator="\t", + truncate_ragged_lines=True, + ) + ) + + # Verify columns are identical: + first_cols = set(dfs[0].columns) + for df in dfs: + if set(df.columns) != first_cols: + raise ValueError("All pin_files must have the same columns") + + return pl.concat(dfs, how="vertical") diff --git a/mokapot/schema.py b/mokapot/schema.py index ce1793d2..dc38dfca 100644 --- a/mokapot/schema.py +++ b/mokapot/schema.py @@ -26,7 +26,7 @@ class _SchemaValidatorMixin: def __init__( self, required: list[str], single: list[str], variadic: list[str] ) -> None: - """Initialize the ColumnValidator""" + """Initialize the ColumnValidator.""" self._required = required self._single = single self._variadic = variadic @@ -209,7 +209,7 @@ class PsmSchema(_SchemaValidatorMixin): The column to use as the default score for ranking PSMs. desc : bool, optional Indicates that higher scores in the score column are better. This is - omly relevant if a score column is specified. + only relevant if a score column is specified. """ target: str @@ -227,7 +227,7 @@ class PsmSchema(_SchemaValidatorMixin): score: str | None = None desc: bool = True - def __post_init__(self): + def __post_init__(self) -> None: """Perform initial validation.""" super().__init__( required=["target", "spectrum", "peptide"], diff --git a/mokapot/utils.py b/mokapot/utils.py index 5cbdc4b3..5101c6fc 100644 --- a/mokapot/utils.py +++ b/mokapot/utils.py @@ -1,54 +1,29 @@ """Utility functions.""" -import itertools +from typing import Any -import numpy as np import polars as pl -def flatten(split): - """Get the indices from split.""" - return list(itertools.chain.from_iterable(split)) - - -def safe_divide(numerator, denominator, ones=False): - """Divide ignoring div by zero warnings.""" - if isinstance(numerator, pl.Series): - numerator = numerator.values - - if isinstance(denominator, pl.Series): - denominator = denominator.values - - numerator = numerator.astype(float) - denominator = denominator.astype(float) - if ones: - out = np.ones_like(numerator) - else: - out = np.zeros_like(numerator) - - return np.divide(numerator, denominator, out=out, where=(denominator != 0)) - - -def tuplize(obj): - """Convert obj to a tuple, without splitting strings.""" - try: - _ = iter(obj) - except TypeError: - obj = (obj,) - else: - if isinstance(obj, str): - obj = (obj,) - - return tuple(obj) +def listify(obj: Any) -> list[Any]: + """Convert obj to a list, without splitting strings or dataframes. + Parameters + ---------- + obj : anything + The object to turn into a list. -def listify(obj): - """Convert obj to a list, without splitting strings.""" + Returns + ------- + list + The list representation of th object. + """ try: _ = iter(obj) except TypeError: obj = [obj] else: - if isinstance(obj, str): + # Don't listify strings or DataFrames. 
+    """
     try:
         _ = iter(obj)
     except TypeError:
         obj = [obj]
     else:
-        if isinstance(obj, str):
+        # Don't listify strings or DataFrames.
+        if isinstance(obj, str) or hasattr(obj, "columns"):
             obj = [obj]
 
     return list(obj)
diff --git a/pyproject.toml b/pyproject.toml
index 956b145f..ecdaaa7f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -72,7 +72,8 @@ select = ["E", "F", "W", "C", "I", "D", "UP", "N", "ANN", "T20", "I001"]
 # D100 Missing docstring in public module
 # ANN102 Missing type annotation for `cls` in classmethod
 # D401 First line of docstring should be in imperative mood
-ignore = ["D213", "ANN101", "D203", "D100", "ANN102", "D401"]
+# ANN401 Dynamically typed expressions (typing.Any) are disallowed
+ignore = ["D213", "ANN101", "D203", "D100", "ANN102", "D401", "ANN401"]
 fix = true
 line-length = 79
diff --git a/tests/unit_tests/test_parser_pin.py b/tests/unit_tests/test_parser_pin.py
index c8010bcd..0d508033 100644
--- a/tests/unit_tests/test_parser_pin.py
+++ b/tests/unit_tests/test_parser_pin.py
@@ -1,12 +1,27 @@
-"""Test that parsing Percolator input files works correctly"""
+"""Test that parsing Percolator input files works correctly."""
 import pytest
+
 import mokapot
-import pandas as pd
 
 
 @pytest.fixture
 def std_pin(tmp_path):
-    """Create a standard PIN file."""
+    """Create a standard PIN file."""
+    out_file = tmp_path / "std_pin"
+    with open(str(out_file), "w+") as pin:
+        dat = (
+            "sPeCid\tLaBel\tgroup\tpepTide\tsCore\tscanNR\tpRoteins\n"
+            "a\t1\t1\tABC\t5\t2\tprotein1\tprotein2\n"
+            "b\t-1\t1\tCBA\t10\t3\tdecoy_protein1\tdecoy_protein2"
+        )
+        pin.write(dat)
+
+    return out_file
+
+
+@pytest.fixture
+def dir_pin(tmp_path):
+    """Create a standard PIN file."""
     out_file = tmp_path / "std_pin"
     with open(str(out_file), "w+") as pin:
         dat = (
@@ -20,18 +35,23 @@ def std_pin(tmp_path):
     return out_file
 
 
-def test_pin_parsing(std_pin):
-    """Test PIN parsing"""
-    df = mokapot.read_pin(std_pin, to_df=True)
-    assert df["LaBel"].dtype == "bool"
+def test_perc_parsing(dir_pin):
+    """Test PIN parsing."""
+    df = mokapot.percolator_to_df(dir_pin)
     assert len(df) == 2
-    assert len(df[df["LaBel"]]) == 1
-    assert len(df[df["LaBel"]]) == 1
 
-    dat = mokapot.read_pin(std_pin)
-    pd.testing.assert_frame_equal(df.loc[:, ("sCore",)], dat.features)
+    assert df["pRoteins"][0] == "protein1\tprotein2"
+
+
+def test_std_pin(std_pin):
+    """Test a PIN file without a DefaultDirection line."""
+    dset = mokapot.read_pin(std_pin, group="group")
+    assert len(dset) == 2
+    assert dset.schema.features == ["sCore"]
 
+    df = mokapot.percolator_to_df(std_pin)
+    assert len(df) == 2
+    assert df["pRoteins"][0] == "protein1\tprotein2"
 
-def test_pin_wo_dir():
-    """Test a PIN file without a DefaultDirection line"""
-    mokapot.read_pin("data/scope2_FP97AA.pin")
+    dset = mokapot.read_pin(dset.data.collect(), group="group")
+    assert len(dset) == 2
+    assert dset.schema.features == ["sCore"]
diff --git a/tests/unit_tests/test_utils.py b/tests/unit_tests/test_utils.py
index eeec9add..53b72d91 100644
--- a/tests/unit_tests/test_utils.py
+++ b/tests/unit_tests/test_utils.py
@@ -1,103 +1,36 @@
-"""Test the utility functions"""
-import numpy as np
-import pandas as pd
+"""Test the utility functions."""
+import polars as pl
 import pytest
+from polars.testing import assert_frame_equal
 
 from mokapot import utils
 
-
-@pytest.fixture
-def df():
-    """Create a simple dataframe."""
-    data = {
-        "val1": [1, 2, 2, 1, 1, 1, 3, 2, 1],
-        "val2": [1] * 9,
-        "group": ["A"] * 3 + ["B"] * 3 + ["C"] * 3,
-    }
-
-    max_res = {"val1": [2, 1, 3], "val2": [1, 1, 1], "group": ["A", "B", "C"]}
-
-    return pd.DataFrame(data), pd.DataFrame(max_res)
-
-
-def test_groupby_max(df):
-    """Test that the groupby_max() function works"""
-    in_df, val1_max = df
-
-    # Verify that the basic idea works:
-    idx = utils.groupby_max(in_df, "group", "val1", 42)
-    out_df = in_df.loc[idx, :].sort_values("group").reset_index(drop=True)
-    pd.testing.assert_frame_equal(val1_max, out_df)
-
-    idx = utils.groupby_max(in_df, ["group"], "val1", 42)
-    out_df = in_df.loc[idx, :].sort_values("group").reset_index(drop=True)
-    pd.testing.assert_frame_equal(val1_max, out_df)
-
-    # Verify that the shuffling bit works:
-    idx1 = set(utils.groupby_max(in_df, "group", "val2", 2))
-    idx2 = set(utils.groupby_max(in_df, "group", "val2", 1))
-    assert idx1 != idx2
-
-
-def test_flatten():
-    """Test that the flatten() function works"""
-    nested_list = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
-    flat_list = [1, 2, 3, 4, 5, 6, 7, 8, 9]
-    assert utils.flatten(nested_list) == flat_list
-
-    nested_array = np.array(nested_list)
-    flat_array = np.array(flat_list)
-    np.testing.assert_array_equal(
-        np.array(utils.flatten(nested_array)), flat_array
-    )
-
-
-def test_safe_divide():
-    """Test that our safe division function works"""
-    np_num = np.array([1, 2, 3, 4])
-    np_den = np.array([1, 2, 3, 0.0])
-    np_out = np.array([1, 1, 1, 0])
-    np.testing.assert_array_equal(utils.safe_divide(np_num, np_den), np_out)
-
-    np_out_ones = np.array([1, 1, 1, 1])
-    np.testing.assert_array_equal(
-        utils.safe_divide(np_num, np_den, ones=True), np_out_ones
-    )
-
-    # Test pandas:
-    df = pd.DataFrame({"num": np_num, "den": np_den})
-    np.testing.assert_array_equal(utils.safe_divide(df.num, df.den), np_out)
-
-    np.testing.assert_array_equal(
-        utils.safe_divide(df.num, df.den, ones=True), np_out_ones
-    )
-
-
-def test_tuplize():
-    """Test that we can turn things into tuples"""
-    list_in = [1, 2, 3]
-    list_out = (1, 2, 3)
-    assert utils.tuplize(list_in) == list_out
-
-    str_in = "blah"
-    str_out = ("blah",)
-    assert utils.tuplize(str_in) == str_out
-
-    tuple_in = ("blah", 1, "x")
-    tuple_out = ("blah", 1, "x")
-    assert utils.tuplize(tuple_in) == tuple_out
-
-
-def test_listify():
-    """Test that we can turn things into lists"""
-    list_in = [1, 2, 3]
-    list_out = [1, 2, 3]
-    assert utils.listify(list_in) == list_out
-
-    str_in = "blah"
-    str_out = ["blah"]
-    assert utils.listify(str_in) == str_out
-
-    tuple_in = ("blah", 1, "x")
-    tuple_out = ["blah", 1, "x"]
-    assert utils.listify(tuple_in) == tuple_out
+DF = pl.DataFrame({"a": [1.0, 2], "b": [3.0, 4]})
+
+
+@pytest.mark.parametrize(
+    ("obj", "expected"),
+    [
+        ([1, 2, 3], [1, 2, 3]),
+        ("blah", ["blah"]),
+        (DF, [DF]),
+    ],
+)
+def test_listify(obj, expected):
+    """Test that we can turn things into lists."""
+    assert utils.listify(obj) == expected
+
+
+@pytest.mark.parametrize(
+    ("obj", "expected"),
+    [
+        (DF, DF.lazy()),
+        (DF.lazy(), DF.lazy()),
+        ({"a": [1.0, 2], "b": [3.0, 4]}, DF.lazy()),
+    ],
+)
+def test_make_lazy(obj, expected):
+    """Test that we can turn things into lazy frames."""
+    out = utils.make_lazy(obj)
+    assert isinstance(out, pl.LazyFrame)
+    assert_frame_equal(out, expected)