MRG: implement ResidualizeTarget #11

Open
wants to merge 6 commits into base: master
Changes from 3 commits
4 changes: 4 additions & 0 deletions confounds/__init__.py
@@ -18,3 +18,7 @@
from ._version import get_versions
__version__ = get_versions()['version']
del get_versions


from .tests import (test_residualize_linear,
test_residualize_targets_linear)
105 changes: 103 additions & 2 deletions confounds/base.py
@@ -293,12 +293,113 @@ class ResidualizeTarget(BaseDeconfound):
"""


def __init__(self):
def __init__(self, model='linear'):
"""Constructor"""

super().__init__(name='ResidualizeTarget')

raise NotImplementedError()
self.model = model


def fit(self,
X, # variable names chosen to correspond to sklearn when possible
y=None, # y is the confound variables here, not the target!
):
"""
Fits the residualizing model, estimating the contributions of the
confounding variables (y) to the given [training] target set X. The
variable names X and y follow scikit-learn conventions: y here refers
to the confound variables, not the prediction target. See the examples
in the docs.

Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_targets)
The training input samples.
Owner: make it clear X is targets here, although it is called X

y : ndarray
Array of covariates, shape (n_samples, n_covariates)
This does not refer to target as is typical in scikit-learn.

Returns
-------
self : object
Returns self
"""

return self._fit(X, y) # which itself must return self


def _fit(self, in_targets, confounds=None):
"""Actual fit method"""

in_targets = check_array(in_targets)
confounds = check_array(confounds, ensure_2d=False)

Owner: Add checks for data type to ensure it is numeric
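A minimal sketch of the kind of check being requested, using numpy dtype inspection (whether to raise or to coerce is an open design choice; this is not part of the PR):

if not (np.issubdtype(in_targets.dtype, np.number)
        and np.issubdtype(confounds.dtype, np.number)):
    # reject non-numeric inputs before fitting the regression model
    raise TypeError('in_targets and confounds must be numeric arrays')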

# turn it into 2D, in case it is just a single column
if confounds.ndim == 1:
confounds = confounds[:, np.newaxis]

try:
    check_consistent_length(in_targets, confounds)
except ValueError as exc:
    raise ValueError('X (targets) and y (confounds) '
                     'must have the same number of rows/samplets!') from exc

self.n_targets_ = in_targets.shape[1]

regr_model = clone(get_model(self.model))
regr_model.fit(confounds, in_targets)
self.model_ = regr_model

return self


def transform(self, X, y=None):
"""
Transforms the given [test] targets by residualizing them, i.e. by
subtracting the contributions estimated for their confounding variables.

The variable names X and y follow scikit-learn conventions: y here
refers to the confound variables for the [test] data to be transformed.
See the examples in the docs.

Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_targets)
The [test] target values to be deconfounded (X holds targets here, not features).
y : ndarray
Array of covariates, shape (n_samples, n_covariates)
This does not refer to target as is typical in scikit-learn.

Returns
-------
deconfounded : ndarray, shape (n_samples, n_targets)
    Residualized (deconfounded) targets
"""

return self._transform(X, y)


def _transform(self, test_targets, test_confounds):
"""Actual deconfounding of the test targets"""

check_is_fitted(self, ('model_', 'n_targets_'))
test_targets = check_array(test_targets, accept_sparse=True)

Owner: add another data type check

if test_targets.shape[1] != self.n_targets_:
raise ValueError('number of targets must be {}. Given {}'
''.format(self.n_targets_, test_targets.shape[1]))

if test_confounds is None: # during estimator checks
return test_targets # do nothing

test_confounds = check_array(test_confounds, ensure_2d=False)
check_consistent_length(test_targets, test_confounds)

# the portion of the test targets that can be explained/predicted by their covariates
test_target_predicted = self.model_.predict(test_confounds)
residuals = test_targets - test_target_predicted

return residuals


class DummyDeconfounding(BaseDeconfound):
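To make the intended API concrete, here is a minimal usage sketch of ResidualizeTarget as implemented above (the synthetic data and seed are illustrative assumptions, not part of the PR):

import numpy as np
from confounds.base import ResidualizeTarget

rng = np.random.default_rng(0)
n_samples, n_targets, n_confounds = 50, 3, 4

targets = rng.normal(size=(n_samples, n_targets))      # "X": the targets to deconfound
confounds = rng.normal(size=(n_samples, n_confounds))  # "y": the confounding covariates

resid = ResidualizeTarget(model='linear')
resid.fit(targets, confounds)                  # estimates confound -> target contributions
deconf_targets = resid.transform(targets, confounds)

# with the linear model, the residualized targets are orthogonal to the confounds
print(np.allclose(deconf_targets.T @ confounds, 0, atol=1e-6))

Note the reversed roles relative to scikit-learn: the confounds are passed where y normally goes.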
24 changes: 23 additions & 1 deletion confounds/tests/test_confounds.py
@@ -8,7 +8,8 @@
from sklearn.datasets import make_classification, make_sparse_uncorrelated
from sklearn.utils.estimator_checks import check_estimator

from confounds.base import Augment, DummyDeconfounding, Residualize
from confounds.base import (Augment, DummyDeconfounding,
Residualize, ResidualizeTarget)


def test_estimator_API():
@@ -62,6 +63,27 @@ def test_residualize_linear():
assert_almost_equal(residual_train_X.T.dot(train_confounds), 0)


def test_residualize_targets_linear():
"""sanity checks on implementation"""

min_dim = 6  # at least 4 features are required by make_sparse_uncorrelated
max_dim = 100
for n_samples in np.random.randint(0, 20, 1):
for num_confounds in np.random.randint(min_dim, max_dim, 3):
train_all, _ = make_sparse_uncorrelated(
    n_samples=n_samples, n_features=min_dim + num_confounds + 1)
Owner: this does not generate discrete numerical/integer values! We need a way to broaden the test with multiple data types!

Author: a bit confused here, does this look better? np.random.randint(0, 20, 1, dtype=int)
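For context: the dtype argument to np.random.randint above only changes how the sample count is drawn, not the feature values; make_sparse_uncorrelated still returns continuous floats. One way to broaden the test to integer-valued data (a sketch only; variable names follow the test) might be:

# discretize the generated features to get an integer-valued variant
train_all_int = np.round(train_all).astype(int)
# or generate discrete covariates directly
rng = np.random.default_rng(0)
discrete_confounds = rng.integers(0, 5, size=(n_samples, num_confounds))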

Owner: before updating the tests, how about you try making a jupyter notebook, and play with this class, and write some examples, to see how it works?

Author: hey @raamana sorry for the late response. this summer has been a bit crazy. probably need more time-out here. i noticed that some repos have been snapshotted in Arctic Code Vault, i guess it didn't snapshot PR :p

Owner: NP, take your time. Happy to chat with you to discuss this and make a more precise plan to finish it quickly. Why are you referencing the Arctic Code Vault? Did you lose your fork or what?

Author: thanks! i'm wondering if the snapshot in Arctic Code Vault would include PR? and no, i didn't lose forks.

Owner: I don't know - but I am sure your contribution will be clearly recognized :)

Owner: hi @zuxfoucault, would you have time this year to work on this?

Author: Hi, sorry I just saw this.


train_y, train_confounds = splitter_X_confounds(train_all, num_confounds)

resid = ResidualizeTarget(model='linear')
resid.fit(train_y, train_confounds)

residual_train_y = resid.transform(train_y, train_confounds)

# residual_train_y and train_confounds must be orthogonal now!
assert_almost_equal(residual_train_y.T.dot(train_confounds), 0)
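This assertion relies on the least-squares normal equations: the residual of an OLS fit is orthogonal to every regressor column. A quick standalone check with plain numpy (a sketch, independent of the class under test):

import numpy as np

rng = np.random.default_rng(42)
C = rng.normal(size=(30, 4))                # stand-in confounds / regressors
T = rng.normal(size=(30, 2))                # stand-in targets
beta, *_ = np.linalg.lstsq(C, T, rcond=None)
resid = T - C @ beta
print(np.allclose(C.T @ resid, 0))          # True: C.T @ (T - C @ beta) = 0 by construction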


def test_method_does_not_introduce_bias():
"""
Test to ensure any deconfounding method does NOT introduce bias in a sample