Make hashing of rows for splitting independent of numpy version and spectra columns
ezander committed Oct 3, 2024
1 parent 02d4c21 commit 7f58137
Showing 2 changed files with 46 additions and 9 deletions.
36 changes: 28 additions & 8 deletions mokapot/dataset.py
@@ -421,8 +421,8 @@ def _calibrate_scores(self, scores, eval_fdr, desc=True):
)


-@typechecked
class OnDiskPsmDataset(PsmDataset):
+    @typechecked
    def __init__(
        self,
        filename_or_reader: Path | TabularDataReader,
@@ -604,6 +604,32 @@ def update_labels(self, scores, target_column, eval_fdr=0.01, desc=True):
desc=desc,
)

    @staticmethod
    def _hash_row(x: np.ndarray) -> int:
        """
        Hash array for splitting of test/training sets.

        Parameters
        ----------
        x : np.ndarray
            Input array to be hashed.

        Returns
        -------
        int
            Computed hash of the input array.
        """

        def to_base_val(v):
            """Return base python value also for numpy types"""
            try:
                return v.item()
            except AttributeError:
                return v

        tup = tuple(to_base_val(x) for x in x)
        return crc32(str(tup).encode())

    def _split(self, folds, rng):
        """
        Get the indices for random, even splits of the dataset.
@@ -626,13 +652,7 @@ def _split(self, folds, rng):
        """
        spectra = self.spectra_dataframe[self.spectrum_columns].values
        del self.spectra_dataframe
-        spectra = np.apply_along_axis(
-            # Need to cast to float, so that numpy 1.x and 2.x return the same
-            # string representation
-            lambda x: crc32(str(tuple(map(float, x))).encode()),
-            1,
-            spectra,
-        )
+        spectra = np.apply_along_axis(OnDiskPsmDataset._hash_row, 1, spectra)

        # sort values to get start position of unique hashes
        spectra_idx = np.argsort(spectra)
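Why the rewrite fixes the numpy-version dependence: str() of a tuple calls repr() on each element, and numpy 2.x changed the repr of its scalar types (e.g. np.float64(890.5) instead of 890.5), so hashing the raw row text gave different CRC32 values under 1.x and 2.x. Converting every element to a plain Python value first produces the same text either way, and it also works for non-numeric spectrum columns such as the file name, which the old float() cast would reject. A minimal standalone sketch of the idea (not part of the commit; row and the isinstance check are illustrative stand-ins for _hash_row's to_base_val):

import numpy as np
from zlib import crc32

# Hypothetical spectrum row mixing a string column with numpy scalar types.
row = np.array(
    ["test.mzML", np.int64(870), np.float64(890.522815122875)], dtype=object
)

# str() of a tuple uses repr() of each element; numpy 1.x prints the float as
# "890.522815122875" while numpy 2.x prints "np.float64(890.522815122875)",
# so this hash differs between numpy versions.
version_dependent = crc32(str(tuple(row)).encode())

# Converting each element to a built-in Python value (str, int, float) first
# gives identical text, and therefore an identical CRC32, on both versions.
normalized = tuple(v.item() if isinstance(v, np.generic) else v for v in row)
version_independent = crc32(str(normalized).encode())
print(version_dependent, version_independent)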
19 changes: 18 additions & 1 deletion tests/unit_tests/test_dataset.py
@@ -4,7 +4,8 @@

import numpy as np
import pandas as pd
-from mokapot import LinearPsmDataset

+from mokapot import LinearPsmDataset, OnDiskPsmDataset


def test_linear_init(psm_df_6):
@@ -56,3 +57,19 @@ def test_update_labels(psm_df_6):
    real_labs = np.array([1, 1, 0, -1, -1, -1])
    new_labs = dset._update_labels(scores, eval_fdr=0.5)
    assert np.array_equal(real_labs, new_labs)


def test_hash_row():
    x = np.array(["test.mzML", 870, 5902.639978936955, 890.522815122875], dtype=object)
    assert OnDiskPsmDataset._hash_row(x) == 4196757312

    x = np.array(
        [
            "test.mzML",
            np.int64(870),
            np.float64(5902.639978936955),
            np.float64(890.522815122875),
        ],
        dtype=object,
    )
    assert OnDiskPsmDataset._hash_row(x) == 4196757312
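For context on how the hash is used: as the comment in _split says, the row hashes are argsorted so that rows describing the same spectrum become adjacent, and the start positions of the unique hash values mark group boundaries that can then be dealt out to folds. A rough standalone sketch of that grouping step (an illustration under assumptions, not mokapot's actual _split; the round-robin fold assignment at the end is invented for the example):

import numpy as np
from zlib import crc32

def hash_row(x):
    # Same normalization idea as OnDiskPsmDataset._hash_row.
    tup = tuple(v.item() if isinstance(v, np.generic) else v for v in x)
    return crc32(str(tup).encode())

# Toy spectra table: file, scan, retention time, precursor m/z.
spectra = np.array(
    [
        ["a.mzML", 1, 10.0, 500.1],
        ["a.mzML", 1, 10.0, 500.1],  # duplicate of the first spectrum
        ["a.mzML", 2, 11.0, 501.2],
        ["b.mzML", 7, 12.0, 502.3],
    ],
    dtype=object,
)

hashes = np.apply_along_axis(hash_row, 1, spectra)

# Sort row indices by hash so identical spectra become adjacent, then find the
# start position of every unique hash; each [start, next_start) slice is one
# spectrum group that must stay in a single fold.
order = np.argsort(hashes)
_, starts = np.unique(hashes[order], return_index=True)
groups = np.split(order, starts[1:])

# Deal the groups round-robin into folds (illustrative; not the real scheme).
folds = 2
fold_idx = [np.concatenate(groups[i::folds]) for i in range(folds)]
print(fold_idx)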
