From da2d545c816f8741e06fa78fb9ce8a1f7ff4abe3 Mon Sep 17 00:00:00 2001 From: Will Fondrie Date: Mon, 10 Apr 2023 16:12:51 -0700 Subject: [PATCH] Lint with Ruff and add other pre-commit hooks (#98) * Migrate to linting with Ruff * Bump changelog * Fix error from pandas update * Bump changelog * Remove debugging print statements * Fix lurking bug and improve test coverage * Bump changelog * Make suggested edits * Catch prints with Ruff * Use extend-exclude instead of exclude for Ruff --- .github/workflows/{black.yml => lint.yml} | 15 ++--- .pre-commit-config.yaml | 23 +++++-- CHANGELOG.md | 73 +++++++++++++---------- docs/source/conf.py | 2 +- mokapot/confidence.py | 4 +- mokapot/config.py | 4 +- mokapot/model.py | 1 - mokapot/mokapot.py | 2 +- mokapot/parsers/pepxml.py | 2 + mokapot/parsers/pin.py | 3 +- mokapot/peptides.py | 4 -- mokapot/picked_protein.py | 53 +++++++++++----- mokapot/qvalues.py | 1 - pyproject.toml | 10 +++- tests/system_tests/test_system.py | 6 +- tests/unit_tests/test_brew.py | 4 +- tests/unit_tests/test_confidence.py | 1 - tests/unit_tests/test_model.py | 2 - tests/unit_tests/test_parser_pepxml.py | 3 - tests/unit_tests/test_parser_pin.py | 2 +- tests/unit_tests/test_picked_protein.py | 17 ++++++ tests/unit_tests/test_writer_flashlfq.py | 3 +- tests/unit_tests/test_writer_txt.py | 3 +- 23 files changed, 146 insertions(+), 92 deletions(-) rename .github/workflows/{black.yml => lint.yml} (56%) diff --git a/.github/workflows/black.yml b/.github/workflows/lint.yml similarity index 56% rename from .github/workflows/black.yml rename to .github/workflows/lint.yml index 1856598d..47eadabf 100644 --- a/.github/workflows/black.yml +++ b/.github/workflows/lint.yml @@ -11,18 +11,19 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - - name: Setup Python 3 + - name: Setup Python 3.10 uses: actions/setup-python@v4 with: python-version: "3.10" + - name: Install Ruff + run: | + python -m pip install --upgrade pip + pip install ruff + - name: Run black uses: psf/black@stable - - name: Check for debugging print statements + - name: Lint with Ruff run: | - if grep -rq "print(" mokapot; then - echo "Found the following print statements:" - grep -r "print(" mokapot - exit 1 - fi + ruff check . --format=github diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d3e5aceb..753caf42 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,19 @@ repos: - - repo: https://github.com/psf/black - rev: 23.1.0 # Replace by any tag/version: https://github.com/psf/black/tags - hooks: - - id: black - language_version: python3 # Should be a command that runs python3.6+ +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: check-toml + - id: check-yaml + - id: end-of-file-fixer + - id: trailing-whitespace + - id: detect-private-key +- repo: https://github.com/charliermarsh/ruff-pre-commit + rev: v0.0.261 + hooks: + - id: ruff + args: ['--fix'] +- repo: https://github.com/psf/black + rev: 23.3.0 + hooks: + - id: black + language_version: python3.10 diff --git a/CHANGELOG.md b/CHANGELOG.md index 90fff412..2a81e5f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,17 @@ -# Changelog for mokapot +# Changelog for mokapot ## [Unreleased] ### Breaking changes -- Mokapot now uses `numpy.random.Generator` instead of the deprecated `numpy.random.RandomState` API. +- Mokapot now uses `numpy.random.Generator` instead of the deprecated `numpy.random.RandomState` API. New `rng` arguments have been added to functions and classes that rely on randomness in lieu of setting a global random seed with `np.random.seed()`. Thanks @sjust-seerbio! +### Changed +- Added linting with Ruff to tests and pre-commit hooks (along with others)! + +### Fixed +- The PepXML reader, which broke due to a Pandas update. +- Potential bug if lowercase peptide sequences were used and protein-level confidence estimates were enabled + ## [0.9.1] - 2022-12-14 ### Changed - Cross-validation classes are now detected by looking for inheritance from the `sklearn.model_selection._search.BaseSearchCV` class. @@ -30,8 +37,8 @@ ## [0.8.2] - 2022-07-18 ### Added -- `mokapot.Model()` objects now recored the CV fold that they were fit on. - This means that they can be provided to `mokapot.brew()` in any order +- `mokapot.Model()` objects now recorded the CV fold that they were fit on. + This means that they can be provided to `mokapot.brew()` in any order and still maintain proper cross-validation bins. ### Fixed @@ -41,7 +48,7 @@ ## [0.8.1] - 2022-06-24 ### Added -- Support for previously trained models in the `brew()` function and the CLI +- Support for previously trained models in the `brew()` function and the CLI using the `--load_models` argument. Thanks @sambenfredj! ### Fixed @@ -51,18 +58,18 @@ ## [0.8.0] - 2022-03-11 -Thanks to @sambenfredj, @gessulat, @tkschmidt, and @MatthewThe for +Thanks to @sambenfredj, @gessulat, @tkschmidt, and @MatthewThe for PR #44, which made these things happen! ### Added - A new command line argument, `--max_workers`. This allows the cross-validation folds to be computed in parallel. -- The `PercolatorModel` class now has an `n_jobs` parameter, which +- The `PercolatorModel` class now has an `n_jobs` parameter, which controls parallelization of the grid search. ### Changes - Improved speed by using multiple jobs for grid search by default. -- Parallelization within `mokapot.brew()` now uses `joblib` +- Parallelization within `mokapot.brew()` now uses `joblib` instead of `concurrent.futures`. ## [0.7.4] - 2021-09-03 @@ -75,37 +82,37 @@ PR #44, which made these things happen! - Fixed bug where the `--keep_decoys` did not work with `--aggregate`. Also, added tests to cover this. Thanks @jspaezp! -## [0.7.2] - 2021-07-16 -### Added +## [0.7.2] - 2021-07-16 +### Added - `--keep_decoys` option to the command line interface. Thanks @jspaezp! - Notes about setting a random seed to the Python API documentation. (Issue #30) -- Added more information about peptides that couldn't be mapped to proteins. (Issue #29) +- Added more information about peptides that couldn't be mapped to proteins. (Issue #29) -### Fixed +### Fixed - Loading a saved model with `mokapot.load_model()` would fail because of an - update to Pandas that introduced a new exception. We've updated mokapot + update to Pandas that introduced a new exception. We've updated mokapot accordingly. -### Changed +### Changed - Updates to unit tests. Warnings are now treated as errors for system tests. -## [0.7.1] - 2021-03-22 -### Changed +## [0.7.1] - 2021-03-22 +### Changed - Updated the build to align with [PEP517](https://www.python.org/dev/peps/pep-0517/) -## [0.7.0] - 2021-03-19 -### Added +## [0.7.0] - 2021-03-19 +### Added - Support for downstream peptide and protein quantitation with [FlashLFQ](https://github.com/smith-chem-wisc/FlashLFQ). This is accomplished through the `mokapot.to_flashlfq()` function or the `to_flashlfq()` method of `LinearConfidence` objects. Note that to support the FlashLFQ format, you'll need to specify additional columns in `read_pin()` or use a PepXML input file - (`read_pepxml()`). + (`read_pepxml()`). - Added a top-level function for exporting confident PSMs, peptides, and proteins from one or more `LinearConfidence` objects as a tab-delimited file: `mokapot.to_txt()`. -- Added a top-level function for reading FASTA files for protein-level +- Added a top-level function for reading FASTA files for protein-level confidence estimates: `mokapot.read_fasta()`. - Tests accompanying the support for the features above. - Added a "mokapot cookbook" to the documentation with helpful code snippets. @@ -120,39 +127,39 @@ PR #44, which made these things happen! `importlib.metadata` to the standard library, saving a few hundred milliseconds. -## [0.6.2] - 2021-03-12 +## [0.6.2] - 2021-03-12 ### Added - Now checks to verify there are no debugging print statements in the code base when linting. -### Fixed +### Fixed - Removed debugging print statements. ## [0.6.1] - 2021-03-11 ### Fixed - Parsing Percolator tab-delimited files with a "DefaultDirection" line. -- `Label` column is now converted to boolean during PIN file parsing. +- `Label` column is now converted to boolean during PIN file parsing. Previously, problems occurred if the `Label` column was of dtype `object`. - Parsing modifications from pepXML files were indexed incorrectly on the peptide string. -## [0.6.0] - 2021-03-03 -### Added +## [0.6.0] - 2021-03-03 +### Added - Support for parsing PSMs from PepXML input files. - This changelog. -### Fixed -- Parsing a FASTA file previously failed if an entry was not followed by a - sequence. Now, missing sequences are tolerated and a warning is given instead. +### Fixed +- Parsing a FASTA file previously failed if an entry was not followed by a + sequence. Now, missing sequences are tolerated and a warning is given instead. - When the learned model was worse than the best feature and the lower scores - were better for the best feature, assigning confidence would fail. + were better for the best feature, assigning confidence would fail. - Easy access to grouped confidence estimates in the Python API were not working - due to a typo. -- Deprecation warnings from Pandas about the `regex` argument. + due to a typo. +- Deprecation warnings from Pandas about the `regex` argument. - Sometimes peptides were removed as shared incorrectly when part of a protein - group. + group. -### Changed +### Changed - Refactored and added many new unit and system tests. - New pull-requests must now improve or maintain test coverage. - Improved error messages. diff --git a/docs/source/conf.py b/docs/source/conf.py index 2025e114..245e9c03 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -27,7 +27,7 @@ copyright = "2022, William E. Fondrie" author = "William E. Fondrie" -import mokapot +import mokapot # noqa: E402 version = str(mokapot.__version__) release = version diff --git a/mokapot/confidence.py b/mokapot/confidence.py index a8578ef8..dec56614 100644 --- a/mokapot/confidence.py +++ b/mokapot/confidence.py @@ -17,9 +17,7 @@ """ import copy import logging -from pathlib import Path -import numpy as np import pandas as pd import matplotlib.pyplot as plt from triqler import qvality @@ -651,7 +649,7 @@ def plot_qvalues(qvalues, threshold=0.1, ax=None, **kwargs): ax.set_xlim(0 - xmargin, threshold + xmargin) ax.set_xlabel("q-value") - ax.set_ylabel(f"Discoveries") + ax.set_ylabel("Discoveries") ax.step(qvals["qvalue"].values, qvals.num.values, where="post", **kwargs) diff --git a/mokapot/config.py b/mokapot/config.py index cff445f0..e32819b8 100644 --- a/mokapot/config.py +++ b/mokapot/config.py @@ -12,8 +12,8 @@ class MokapotHelpFormatter(argparse.HelpFormatter): """Format help text to keep newlines and whitespace""" def _fill_text(self, text, width, indent): - text_list = text.splitlines(keepends=True) - return "\n".join(_process_line(l, width, indent) for l in text_list) + lines = text.splitlines(keepends=True) + return "\n".join(_process_line(txt, width, indent) for txt in lines) class Config: diff --git a/mokapot/model.py b/mokapot/model.py index 06484fbc..da50812a 100644 --- a/mokapot/model.py +++ b/mokapot/model.py @@ -16,7 +16,6 @@ import copy import logging import pickle -import warnings import numpy as np import pandas as pd diff --git a/mokapot/mokapot.py b/mokapot/mokapot.py index e4dd5291..760db931 100644 --- a/mokapot/mokapot.py +++ b/mokapot/mokapot.py @@ -123,7 +123,7 @@ def main(): model = list(plugin_models.values())[0] if model is None: - logging.debug(f"Loading Percolator model.") + logging.debug("Loading Percolator model.") model = PercolatorModel( train_fdr=config.train_fdr, max_iter=config.max_iter, diff --git a/mokapot/parsers/pepxml.py b/mokapot/parsers/pepxml.py index 4dfbee5f..6dfe4b28 100644 --- a/mokapot/parsers/pepxml.py +++ b/mokapot/parsers/pepxml.py @@ -332,6 +332,8 @@ def _log_features(col, features): """ if col.name not in features: return col + elif col.dtype == "bool": + return col.astype(float) col = col.astype(str).str.lower() diff --git a/mokapot/parsers/pin.py b/mokapot/parsers/pin.py index 8f988507..719f842d 100644 --- a/mokapot/parsers/pin.py +++ b/mokapot/parsers/pin.py @@ -4,7 +4,6 @@ import gzip import logging -import numpy as np import pandas as pd from .. import utils @@ -234,7 +233,7 @@ def _parse_in_chunks(file_obj, columns, chunk_size=int(1e8)): if not psms: break - psms = [l.rstrip().split("\t", len(columns) - 1) for l in psms] + psms = [p.rstrip().split("\t", len(columns) - 1) for p in psms] psms = pd.DataFrame.from_records(psms, columns=columns) yield psms.apply(pd.to_numeric, errors="ignore") diff --git a/mokapot/peptides.py b/mokapot/peptides.py index 09c7904e..eb9d5fd3 100644 --- a/mokapot/peptides.py +++ b/mokapot/peptides.py @@ -3,10 +3,6 @@ """ from collections import defaultdict -import numpy as np -import pandas as pd -import numba as nb - def match_decoy(decoys, targets, ignore_mods=True): """Find a corresponding target for each decoy. diff --git a/mokapot/picked_protein.py b/mokapot/picked_protein.py index ab2f7d59..9c97db80 100644 --- a/mokapot/picked_protein.py +++ b/mokapot/picked_protein.py @@ -3,7 +3,6 @@ confidence estimates. """ import logging -import numpy as np import pandas as pd from .peptides import match_decoy @@ -42,21 +41,7 @@ def picked_protein( columns={peptide_column: "best peptide"} ) - # Strip modifications and flanking AA's from peptide sequences. - prots["stripped sequence"] = ( - prots["best peptide"] - .str.replace(r"[\[\(].*?[\]\)]", "", regex=True) - .str.replace(r"^.*?\.", "", regex=True) - .str.replace(r"\..*?$", "", regex=True) - ) - - # Sometimes folks use lowercase letters for the termini or mods: - if all(prots["stripped sequence"].str.islower()): - seqs = prots["stripped sequence"].upper() - else: - seqs = prots["stripped sequence"].str.replace(r"[a-z]", "", regex=True) - - prots["stripped sequence"] = seqs + prots["stripped sequence"] = strip_peptides(prots["best peptide"]) # There are two cases we need to deal with: # 1. The fasta contained both targets and decoys (ideal) @@ -131,6 +116,42 @@ def picked_protein( return prots.loc[prot_idx, final_cols] +def strip_peptides(sequences): + """Strip modifications and flanking AA's from peptide sequences. + + Parameters + ---------- + sequences : pandas.Series + The peptide sequences. + + Returns + ------- + pandas.Series + The stripped peptide sequences. + + Example + ------- + >>> pep = pd.Series(["A.LES[+79.]LIEK.A"]) + >>> srip_peptides(pep) + 0 LESLIEK + dtype: object + """ + # Strip modifications and flanking AA's from peptide sequences. + sequences = ( + sequences.str.replace(r"[\[\(].*?[\]\)]", "", regex=True) + .str.replace(r"^.*?\.", "", regex=True) + .str.replace(r"\..*?$", "", regex=True) + ) + + # Sometimes folks use lowercase letters for the termini or mods: + if all(sequences.str.islower()): + sequences = sequences.str.upper() + else: + sequences = sequences.str.replace(r"[a-z]", "", regex=True) + + return sequences + + def group_with_decoys(peptides, proteins): """Retrieve the protein group in the case where the FASTA has decoys. diff --git a/mokapot/qvalues.py b/mokapot/qvalues.py index ad41a72c..a0e0066e 100644 --- a/mokapot/qvalues.py +++ b/mokapot/qvalues.py @@ -2,7 +2,6 @@ This module estimates q-values. """ import numpy as np -import pandas as pd import numba as nb diff --git a/pyproject.toml b/pyproject.toml index bb197a50..add3baf0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,9 +62,17 @@ find = {namespaces = false} [tool.setuptools_scm] +[tool.ruff] +select = ["E", "F", "T20"] # T20 is for print() statements. +extend-exclude = ["docs/source/conf.py"] + +[tool.ruff.per-file-ignores] +"__init__.py" = ["F401"] +"test_parser_pepxml.py" = ["E501"] + [tool.black] line-length = 79 -target-version = ['py37'] +target-version = ['py310'] include = '\.pyi?$' exclude = ''' diff --git a/tests/system_tests/test_system.py b/tests/system_tests/test_system.py index 9bc34d81..087cdd5f 100644 --- a/tests/system_tests/test_system.py +++ b/tests/system_tests/test_system.py @@ -24,10 +24,10 @@ def test_compare_to_percolator(): dat.add_proteins(os.path.join("data", "human_sp_td.fasta")) res, _ = mokapot.brew(dat) - perc_path = os.path.join("data", "percolator.{l}.txt") + perc_path = os.path.join("data", "percolator.{p}.txt") perc_res = { - l: mokapot.read_percolator(perc_path.format(l=l)) - for l in ["psms", "peptides", "proteins"] + p: mokapot.read_percolator(perc_path.format(p=p)) + for p in ["psms", "peptides", "proteins"] } for level in ["psms", "peptides", "proteins"]: diff --git a/tests/unit_tests/test_brew.py b/tests/unit_tests/test_brew.py index 5c9b0aa3..319626b1 100644 --- a/tests/unit_tests/test_brew.py +++ b/tests/unit_tests/test_brew.py @@ -111,7 +111,9 @@ def test_brew_trained_models(psms, svm): def test_brew_using_few_models_error(psms, svm): - """Test that if the number of trained models less than the number of folds we get the expected error message""" + """Test that if the number of trained models less than the number of + folds we get the expected error message. + """ with pytest.raises(ValueError) as err: mokapot.brew(psms, [svm, svm], test_fdr=0.05) assert ( diff --git a/tests/unit_tests/test_confidence.py b/tests/unit_tests/test_confidence.py index 63a6fc56..34994f07 100644 --- a/tests/unit_tests/test_confidence.py +++ b/tests/unit_tests/test_confidence.py @@ -1,7 +1,6 @@ """Test that Confidence classes are working correctly""" import pickle -import pytest import numpy as np import pandas as pd from mokapot import LinearPsmDataset diff --git a/tests/unit_tests/test_model.py b/tests/unit_tests/test_model.py index b90b43b3..252396ed 100644 --- a/tests/unit_tests/test_model.py +++ b/tests/unit_tests/test_model.py @@ -39,8 +39,6 @@ def test_model_init(): model = mokapot.Model(LogisticRegression()) assert isinstance(model.scaler, StandardScaler) - print(model) - def test_perc_init(): """Test the initialization of a PercolatorModel""" diff --git a/tests/unit_tests/test_parser_pepxml.py b/tests/unit_tests/test_parser_pepxml.py index 368b26cc..454412aa 100644 --- a/tests/unit_tests/test_parser_pepxml.py +++ b/tests/unit_tests/test_parser_pepxml.py @@ -2,7 +2,6 @@ import pytest import mokapot import numpy as np -from lxml import etree @pytest.fixture @@ -36,8 +35,6 @@ def test_pepxml_success(small_pepxml): def test_pepxml2df(small_pepxml): """Test that we can create a dataframe""" single = mokapot.read_pepxml(small_pepxml, decoy_prefix="rev_", to_df=True) - - print(single) assert len(single) == 4 assert len(single["scan"].unique()) == 2 np.testing.assert_array_equal(single["charge_2"], np.array([1, 1, 1, 0])) diff --git a/tests/unit_tests/test_parser_pin.py b/tests/unit_tests/test_parser_pin.py index bae0fd31..c8010bcd 100644 --- a/tests/unit_tests/test_parser_pin.py +++ b/tests/unit_tests/test_parser_pin.py @@ -34,4 +34,4 @@ def test_pin_parsing(std_pin): def test_pin_wo_dir(): """Test a PIN file without a DefaultDirection line""" - dat = mokapot.read_pin("data/scope2_FP97AA.pin") + mokapot.read_pin("data/scope2_FP97AA.pin") diff --git a/tests/unit_tests/test_picked_protein.py b/tests/unit_tests/test_picked_protein.py index e69de29b..f761a4dc 100644 --- a/tests/unit_tests/test_picked_protein.py +++ b/tests/unit_tests/test_picked_protein.py @@ -0,0 +1,17 @@ +"""Test the picked protein approach functions""" +import pandas as pd + +from mokapot.picked_protein import strip_peptides + + +def test_strip_peptides(): + """Test removing modifications from a peptide sequence""" + in_df = pd.Series(["A.B.C", "nABCc", "BL[+mod]AH", "A.B[1.1].C"]) + expected = pd.Series(["B", "ABC", "BLAH", "B"]) + out_df = strip_peptides(in_df) + pd.testing.assert_series_equal(out_df, expected) + + in_df = pd.Series(["abc"]) + expected = pd.Series(["ABC"]) + out_df = strip_peptides(in_df) + pd.testing.assert_series_equal(out_df, expected) diff --git a/tests/unit_tests/test_writer_flashlfq.py b/tests/unit_tests/test_writer_flashlfq.py index 968b4f2e..9aba9d70 100644 --- a/tests/unit_tests/test_writer_flashlfq.py +++ b/tests/unit_tests/test_writer_flashlfq.py @@ -1,5 +1,4 @@ """Test that FlashLFQ export is working""" -import copy import pytest import mokapot @@ -11,7 +10,7 @@ def test_sanity(psms, tmp_path): """Run simple sanity checks""" conf = psms.assign_confidence() test1 = conf.to_flashlfq(tmp_path / "test1.txt") - test2 = mokapot.to_flashlfq(conf, tmp_path / "test2.txt") + mokapot.to_flashlfq(conf, tmp_path / "test2.txt") test3 = mokapot.to_flashlfq([conf, conf], tmp_path / "test3.txt") with pytest.raises(ValueError): mokapot.to_flashlfq("blah", tmp_path / "test4.txt") diff --git a/tests/unit_tests/test_writer_txt.py b/tests/unit_tests/test_writer_txt.py index 14068f84..fea7f19a 100644 --- a/tests/unit_tests/test_writer_txt.py +++ b/tests/unit_tests/test_writer_txt.py @@ -3,7 +3,6 @@ import pytest import mokapot -import numpy as np import pandas as pd @@ -11,7 +10,7 @@ def test_sanity(psms, tmp_path): """Run simple sanity checks""" conf = psms.assign_confidence() test1 = conf.to_txt(dest_dir=tmp_path, file_root="test1") - test2 = mokapot.to_txt(conf, dest_dir=tmp_path, file_root="test2") + mokapot.to_txt(conf, dest_dir=tmp_path, file_root="test2") test3 = mokapot.to_txt([conf, conf], dest_dir=tmp_path, file_root="test3") with pytest.raises(ValueError): mokapot.to_txt("blah", dest_dir=tmp_path)