diff --git a/chemotools/baseline/air_pls.py b/chemotools/baseline/air_pls.py index ecfdeb8..ace4efd 100644 --- a/chemotools/baseline/air_pls.py +++ b/chemotools/baseline/air_pls.py @@ -132,14 +132,14 @@ def transform(self, X: np.ndarray, y=None) -> np.ndarray: return X_.reshape(-1, 1) if X_.ndim == 1 else X_ def _calculate_whittaker_smooth(self, x, w): - X = np.matrix(x) + X = np.array(x) m = X.size E = eye(m, format="csc") for i in range(self.polynomial_order): E = E[1:] - E[:-1] W = diags(w, 0, shape=(m, m)) - A = csc_matrix(W + (self.lam * E.T * E)) - B = csc_matrix(W * X.T) + A = csc_matrix(W + (self.lam * E.T @ E)) + B = csc_matrix(W @ X.T).toarray().ravel() background = spsolve(A, B) return np.array(background) diff --git a/chemotools/feature_selection/__init__.py b/chemotools/feature_selection/__init__.py new file mode 100644 index 0000000..9a69781 --- /dev/null +++ b/chemotools/feature_selection/__init__.py @@ -0,0 +1,2 @@ +from ._index_selector import IndexSelector +from ._range_cut import RangeCut diff --git a/chemotools/variable_selection/select_features.py b/chemotools/feature_selection/_index_selector.py similarity index 67% rename from chemotools/variable_selection/select_features.py rename to chemotools/feature_selection/_index_selector.py index 46781e7..3523c41 100644 --- a/chemotools/variable_selection/select_features.py +++ b/chemotools/feature_selection/_index_selector.py @@ -1,11 +1,13 @@ import numpy as np -from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin +from sklearn.base import BaseEstimator +from sklearn.feature_selection._base import SelectorMixin + from sklearn.utils.validation import check_is_fitted from chemotools.utils.check_inputs import check_input -class SelectFeatures(OneToOneFeatureMixin, BaseEstimator, TransformerMixin): +class IndexSelector(BaseEstimator, SelectorMixin): """ A transformer that Selects the spectral data to a specified array of features. This array can be continuous or discontinuous. The array of features is specified by: @@ -52,7 +54,7 @@ def __init__( self.features = features self.wavenumbers = wavenumbers - def fit(self, X: np.ndarray, y=None) -> "SelectFeatures": + def fit(self, X: np.ndarray, y=None) -> "IndexSelector": """ Fit the transformer to the input data. @@ -66,14 +68,11 @@ def fit(self, X: np.ndarray, y=None) -> "SelectFeatures": Returns ------- - self : SelectFeatures + self : IndexSelector The fitted transformer. """ - # Check that X is a 2D array and has only finite values - X = check_input(X) - - # Set the number of features - self.n_features_in_ = X.shape[1] + # validate that X is a 2D array and has only finite values + X = self._validate_data(X) # Set the fitted attribute to True self._is_fitted = True @@ -91,41 +90,23 @@ def fit(self, X: np.ndarray, y=None) -> "SelectFeatures": return self - def transform(self, X: np.ndarray, y=None) -> np.ndarray: + def _get_support_mask(self): """ - Transform the input data by cutting it to the specified range. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The input data to transform. - - y : None - Ignored. + Get the boolean mask indicating which features are selected. Returns ------- - X_ : np.ndarray of shape (n_samples, n_features) - The transformed data. + mask : ndarray of shape (n_features,) + The mask indicating the selected features. """ # Check that the estimator is fitted - check_is_fitted(self, "_is_fitted") + check_is_fitted(self) - # Check that X is a 2D array and has only finite values - X = check_input(X) - X_ = X.copy() - - # Check that the number of features is the same as the fitted data - if X_.shape[1] != self.n_features_in_: - raise ValueError( - f"Expected {self.n_features_in_} features but got {X_.shape[1]}" - ) - - # Select the features - if self.features is None: - return X_ + # Create the mask + mask = np.zeros(self.n_features_in_, dtype=bool) + mask[self.features_index_] = True - return X_[:, self.features_index_] + return mask def _find_index(self, target: float) -> int: if self.wavenumbers is None: diff --git a/chemotools/variable_selection/range_cut.py b/chemotools/feature_selection/_range_cut.py similarity index 60% rename from chemotools/variable_selection/range_cut.py rename to chemotools/feature_selection/_range_cut.py index cf1f558..9ad63f5 100644 --- a/chemotools/variable_selection/range_cut.py +++ b/chemotools/feature_selection/_range_cut.py @@ -1,13 +1,12 @@ import numpy as np -from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin +from sklearn.base import BaseEstimator +from sklearn.feature_selection._base import SelectorMixin from sklearn.utils.validation import check_is_fitted -from chemotools.utils.check_inputs import check_input - -class RangeCut(OneToOneFeatureMixin, BaseEstimator, TransformerMixin): +class RangeCut(BaseEstimator, SelectorMixin): """ - A transformer that cuts the input data to a specified range. The range is specified: + A selector that cuts the input data to a specified range. The range is specified: - by the indices of the start and end of the range, - by the wavenumbers of the start and end of the range. In this case, the wavenumbers must be provided to the transformer when it is initialised. If the wavenumbers @@ -35,19 +34,11 @@ class RangeCut(OneToOneFeatureMixin, BaseEstimator, TransformerMixin): end_index_ : int The index of the end of the range. It is -1 if the wavenumbers are not provided. - n_features_in_ : int - The number of features in the input data. - - _is_fitted : bool - Whether the transformer has been fitted to data. Methods ------- fit(X, y=None) Fit the transformer to the input data. - - transform(X, y=0, copy=True) - Transform the input data by cutting it to the specified range. """ def __init__( @@ -78,13 +69,7 @@ def fit(self, X: np.ndarray, y=None) -> "RangeCut": The fitted transformer. """ # Check that X is a 2D array and has only finite values - X = check_input(X) - - # Set the number of features - self.n_features_in_ = X.shape[1] - - # Set the fitted attribute to True - self._is_fitted = True + X = self._validate_data(X) # Set the start and end indices if self.wavenumbers is None: @@ -95,39 +80,25 @@ def fit(self, X: np.ndarray, y=None) -> "RangeCut": self.end_index_ = self._find_index(self.end) return self + - def transform(self, X: np.ndarray, y=None) -> np.ndarray: + def _get_support_mask(self): """ - Transform the input data by cutting it to the specified range. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The input data to transform. - - y : None - Ignored. + Get the boolean mask indicating which features are selected. Returns ------- - X_ : np.ndarray of shape (n_samples, n_features) - The transformed data. + mask : np.ndarray of shape (n_features,) + The boolean mask indicating which features are selected. """ # Check that the estimator is fitted - check_is_fitted(self, "_is_fitted") - - # Check that X is a 2D array and has only finite values - X = check_input(X) - X_ = X.copy() + check_is_fitted(self, ["start_index_", "end_index_"]) - # Check that the number of features is the same as the fitted data - if X_.shape[1] != self.n_features_in_: - raise ValueError( - f"Expected {self.n_features_in_} features but got {X_.shape[1]}" - ) + # Create the mask + mask = np.zeros(self.n_features_in_, dtype=bool) + mask[self.start_index_ : self.end_index_] = True - # Range cut the spectra - return X_[:, self.start_index_ : self.end_index_] + return mask def _find_index(self, target: float) -> int: wavenumbers = np.array(self.wavenumbers) diff --git a/chemotools/smooth/whittaker_smooth.py b/chemotools/smooth/whittaker_smooth.py index aeca968..1a64ec9 100644 --- a/chemotools/smooth/whittaker_smooth.py +++ b/chemotools/smooth/whittaker_smooth.py @@ -113,14 +113,14 @@ def transform(self, X: np.ndarray, y=None) -> np.ndarray: return X_.reshape(-1, 1) if X_.ndim == 1 else X_ def _calculate_whittaker_smooth(self, x): - X = np.matrix(x) + X = np.array(x) m = X.size E = eye(m, format="csc") w = np.ones(m) for i in range(self.differences): E = E[1:] - E[:-1] W = diags(w, 0, shape=(m, m)) - A = csc_matrix(W + (self.lam * E.T * E)) - B = csc_matrix(W * X.T) + A = csc_matrix(W + (self.lam * E.T @ E)) + B = csc_matrix(W @ X.T).toarray().ravel() background = spsolve(A, B) return np.array(background) diff --git a/chemotools/variable_selection/__init__.py b/chemotools/variable_selection/__init__.py deleted file mode 100644 index 99cb740..0000000 --- a/chemotools/variable_selection/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .range_cut import RangeCut -from .select_features import SelectFeatures \ No newline at end of file diff --git a/tests/test_functionality.py b/tests/test_functionality.py index a4f728a..653f5e2 100644 --- a/tests/test_functionality.py +++ b/tests/test_functionality.py @@ -1,4 +1,5 @@ import numpy as np +import pandas as pd import pytest from chemotools.augmentation import ( @@ -27,7 +28,7 @@ StandardNormalVariate, ) from chemotools.smooth import MeanFilter, MedianFilter, WhittakerSmooth -from chemotools.variable_selection import RangeCut, SelectFeatures +from chemotools.feature_selection import IndexSelector, RangeCut from tests.fixtures import ( spectrum, spectrum_arpls, @@ -231,6 +232,77 @@ def test_extended_baseline_correction_through_msc_median(spectrum): # Assert assert np.allclose(spectrum_emsc[0], spectrum_msc, atol=1e-8) + + + +def test_index_selector(): + # Arrange + spectrum = np.array([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]]) + + # Act + select_features = IndexSelector() + spectrum_corrected = select_features.fit_transform(spectrum) + + # Assert + assert np.allclose(spectrum_corrected[0], spectrum[0], atol=1e-8) + + +def test_index_selector_with_index(): + # Arrange + spectrum = np.array([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]]) + expected = np.array([[1, 2, 3, 8, 9, 10]]) + + # Act + select_features = IndexSelector(features=np.array([0, 1, 2, 7, 8, 9])) + spectrum_corrected = select_features.fit_transform(spectrum) + + # Assert + assert np.allclose(spectrum_corrected[0], expected, atol=1e-8) + + +def test_index_selector_with_wavenumbers(): + # Arrange + wavenumbers = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]) + spectrum = np.array([[1.0, 2.0, 3.0, 5.0, 8.0, 13.0, 21.0, 34.0, 55.0, 89.0]]) + expected = np.array([[1.0, 2.0, 3.0, 34.0, 55.0, 89.0]]) + + # Act + select_features = IndexSelector( + features=np.array([1, 2, 3, 8, 9, 10]), wavenumbers=wavenumbers + ) + spectrum_corrected = select_features.fit_transform(spectrum) + + # Assert + assert np.allclose(spectrum_corrected[0], expected, atol=1e-8) + + +def test_index_selector_with_wavenumbers_and_dataframe(): + # Arrange + wavenumbers = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]) + spectrum = pd.DataFrame(np.array([[1.0, 2.0, 3.0, 5.0, 8.0, 13.0, 21.0, 34.0, 55.0, 89.0]])) + expected = np.array([[1.0, 2.0, 3.0, 34.0, 55.0, 89.0]]) + + # Act + select_features = IndexSelector( + features=np.array([1, 2, 3, 8, 9, 10]), wavenumbers=wavenumbers + ).set_output(transform='pandas') + + spectrum_corrected = select_features.fit_transform(spectrum) + + # Assert + assert type(spectrum_corrected) == pd.DataFrame + + +def test_index_shift(): + # Arrange + spectrum = np.array([[1, 1, 1, 1, 1, 2, 1, 1, 1, 1]]) + spectrum_shift = IndexShift(shift=1, random_state=42) + + # Act + spectrum_corrected = spectrum_shift.fit_transform(spectrum) + + # Assert + assert spectrum_corrected[0][4] == 2 def test_l1_norm(spectrum): @@ -539,7 +611,7 @@ def test_range_cut_by_wavenumber(): assert np.allclose(spectrum_corrected[0], spectrum[0][1:7], atol=1e-8) -def test_range_cut_by_wavenumber_2(): +def test_range_cut_by_wavenumber_with_list(): # Arrange wavenumbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] spectrum = np.array([[10, 12, 14, 16, 14, 12, 10, 12, 14, 16]]) @@ -552,6 +624,19 @@ def test_range_cut_by_wavenumber_2(): assert np.allclose(spectrum_corrected[0], spectrum[0][1:7], atol=1e-8) +def test_range_cut_by_wavenumber_with_dataframe(): + # Arrange + wavenumbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + spectrum = pd.DataFrame(np.array([[10, 12, 14, 16, 14, 12, 10, 12, 14, 16]])) + range_cut = RangeCut(start=2.5, end=7.9, wavenumbers=wavenumbers).set_output(transform='pandas') + + # Act + spectrum_corrected = range_cut.fit_transform(spectrum) + + # Assert + assert type(spectrum_corrected) == pd.DataFrame + + def test_robust_normal_variate(): # Arrange spectrum = np.array([2, 3.5, 5, 27, 8, 9]).reshape(1, -1) @@ -608,59 +693,6 @@ def test_saviszky_golay_filter_3(): assert np.allclose(spectrum_corrected[0], np.ones((1, 10)), atol=1e-2) -def test_select_features(): - # Arrange - spectrum = np.array([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]]) - - # Act - select_features = SelectFeatures() - spectrum_corrected = select_features.fit_transform(spectrum) - - # Assert - assert np.allclose(spectrum_corrected[0], spectrum[0], atol=1e-8) - - -def test_select_features_with_index(): - # Arrange - spectrum = np.array([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]]) - expected = np.array([[1, 2, 3, 8, 9, 10]]) - - # Act - select_features = SelectFeatures(features=np.array([0, 1, 2, 7, 8, 9])) - spectrum_corrected = select_features.fit_transform(spectrum) - - # Assert - assert np.allclose(spectrum_corrected[0], expected, atol=1e-8) - - -def test_select_features_with_wavenumbers(): - # Arrange - wavenumbers = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]) - spectrum = np.array([[1.0, 2.0, 3.0, 5.0, 8.0, 13.0, 21.0, 34.0, 55.0, 89.0]]) - expected = np.array([[1.0, 2.0, 3.0, 34.0, 55.0, 89.0]]) - - # Act - select_features = SelectFeatures( - features=np.array([1, 2, 3, 8, 9, 10]), wavenumbers=wavenumbers - ) - spectrum_corrected = select_features.fit_transform(spectrum) - - # Assert - assert np.allclose(spectrum_corrected[0], expected, atol=1e-8) - - -def test_index_shift(): - # Arrange - spectrum = np.array([[1, 1, 1, 1, 1, 2, 1, 1, 1, 1]]) - spectrum_shift = IndexShift(shift=1, random_state=42) - - # Act - spectrum_corrected = spectrum_shift.fit_transform(spectrum) - - # Assert - assert spectrum_corrected[0][4] == 2 - - def test_spectrum_scale(spectrum): # Arrange spectrum_scale = SpectrumScale(scale=0.01, random_state=42) diff --git a/tests/test_sklearn_compliance.py b/tests/test_sklearn_compliance.py index 45d4287..a4a192b 100644 --- a/tests/test_sklearn_compliance.py +++ b/tests/test_sklearn_compliance.py @@ -33,7 +33,7 @@ SavitzkyGolayFilter, WhittakerSmooth, ) -from chemotools.variable_selection import RangeCut, SelectFeatures +from chemotools.feature_selection import RangeCut, IndexSelector from tests.fixtures import spectrum @@ -94,6 +94,14 @@ def test_compliance_extended_multiplicative_scatter_correction(): check_estimator(transformer) +# IndexSelector +def test_compliance_index_selector(): + # Arrange + transformer = IndexSelector() + # Act & Assert + check_estimator(transformer) + + # IndexShift def test_compliance_spectrum_shift(): # Arrange @@ -197,6 +205,22 @@ def test_compliance_polynomial_correction(): check_estimator(transformer) +# RangeCut +def test_compliance_range_cut(): + # Arrange + transformer = RangeCut() + # Act & Assert + check_estimator(transformer) + + +# RobustNormalVariate +def test_compliance_robust_normal_variate(): + # Arrange + transformer = RobustNormalVariate() + # Act & Assert + check_estimator(transformer) + + # SavitzkyGolay def test_compliance_savitzky_golay(): # Arrange @@ -213,14 +237,6 @@ def test_compliance_savitzky_golay_filter(): check_estimator(transformer) -# SelectFeatures -def test_compliance_select_features(): - # Arrange - transformer = SelectFeatures() - # Act & Assert - check_estimator(transformer) - - # SpectrumScale def test_compliance_spectrum_scale(): # Arrange @@ -237,21 +253,6 @@ def test_compliance_standard_normal_variate(): check_estimator(transformer) -# RangeCut -def test_compliance_range_cut(): - # Arrange - transformer = RangeCut() - # Act & Assert - check_estimator(transformer) - - -# RobustNormalVariate -def test_compliance_robust_normal_variate(): - # Arrange - transformer = RobustNormalVariate() - # Act & Assert - check_estimator(transformer) - # SubtractReference def test_compliance_subtract_reference(): # Arrange