Add files via upload

brainML · Oct 10, 2022 · e25244b · e25244b
1 parent c3f94e1
commit e25244b
Show file tree

Hide file tree

Showing 3 changed files with 660 additions and 0 deletions.
diff --git a/make_predictions.py b/make_predictions.py
@@ -0,0 +1,103 @@
+import argparse
+import numpy as np
+import warnings
+
+from utils import delay_mat, delay_one, zscore_word_data, run_class_time_CV_crossval_ridge, run_class_time_CV_fmri_crossval_ridge
+from utils import subject_runs, surfaces, transforms, load_transpose_zscore, smooth_run_not_masked, CV_ind
+
+
+from sklearn.decomposition import PCA
+from scipy.stats import zscore
+import scipy.io as sio
+
+import time as tm
+from scipy import signal
+import pickle as pk
+
+SKIP_WORDS = 20
+END_WORDS = 5176
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--subject", required=True)
+    parser.add_argument("--predict_feat_type", required=True)
+    parser.add_argument("--nlp_feat_type", required=True)
+    parser.add_argument("--nlp_feat_dir", required=True)
+    parser.add_argument("--layer", type=int, required=False)
+    parser.add_argument("--sequence_length", type=int, required=False)
+    parser.add_argument("--output_fname_prefix", required=True)
+    parser.add_argument("--output_dir", required=True)
+    parser.add_argument("--delay", type=int, default=0)
+    parser.add_argument("--recording_type", required=True)
+    parser.add_argument("--regress_feat_types", default=0)
+    parser.add_argument("--encoding", type=int, default=1)
+
+    args = parser.parse_args()
+    print(args)
+
+    if args.regress_feat_types != '0':
+        regress_feat_names_list = np.sort(args.regress_feat_types.split('+'))
+    else:
+        regress_feat_names_list = []
+
+    predict_feat_dict = {'feat_type':args.predict_feat_type, 
+                         'nlp_feat_type':args.nlp_feat_type,
+                         'nlp_feat_dir':args.nlp_feat_dir,
+                         'layer':args.layer,
+                         'seq_len':args.sequence_length}
+    warnings.filterwarnings("ignore")
+
+    if args.recording_type == 'meg':
+
+        # predict the previous n words, current word, and the next n words
+        delays =  list(np.arange(args.delay,-args.delay-1,-1)) 
+        #print(delays)
+
+        mat = sio.loadmat('./data/meg/{}_HP_notDetrended_25ms.mat'.format(args.subject))
+        data = mat['data']
+
+        n_words = data.shape[0]
+        n_time = data.shape[2]
+        n_sensor = data.shape[1]
+
+        data = data[SKIP_WORDS:END_WORDS]
+        data = delay_mat(data,delays)
+        print(data.shape)
+
+
+        corrs_t, preds_t, test_t = run_class_time_CV_crossval_ridge(data, 
+                                                                    predict_feat_dict,
+                                                                    regress_feat_names_list = regress_feat_names_list,
+                                                                    delays = delays,
+                                                                    encoding=args.encoding,
+                                                                    detrend = False,
+                                                                    do_correct = [],
+                                                                    splay = [],
+                                                                    do_acc = False,
+                                                                    SKIP_WORDS = SKIP_WORDS,
+                                                                    END_WORDS = END_WORDS)
+    elif args.recording_type == 'fmri':
+
+        # load fMRI data
+        data = np.load('./data/fmri/data_subject_{}.npy'.format(args.subject))
+        corrs_t, _, _, preds_t, test_t = run_class_time_CV_fmri_crossval_ridge(data,
+                                                                    predict_feat_dict,
+                                                                    regress_feat_names_list=regress_feat_names_list)
+
+    else:
+        print('Unrecognized recording_type {}. Options: meg, fmri'.format(args.recording_type))
+
+
+    if args.predict_feat_type == 'elmo' or args.predict_feat_type == 'bert':
+        if args.regress_feat_types != '0':
+            fname = 'predict_{}_with_{}_layer_{}_len_{}_regress_out_{}'.format(args.output_fname_prefix, args.predict_feat_type, args.layer, args.sequence_length, '+'.join(regress_feat_names_list))
+        else:
+            fname = 'predict_{}_with_{}_layer_{}_len_{}'.format(args.output_fname_prefix, args.predict_feat_type, args.layer, args.sequence_length)
+    else:
+        if args.regress_feat_types != '0':
+            fname = 'predict_{}_with_{}_regress_out_{}'.format(args.output_fname_prefix, args.predict_feat_type, '+'.join(regress_feat_names_list))
+        else:
+            fname = 'predict_{}_with_{}'.format(args.output_fname_prefix, args.predict_feat_type)
+
+    print('saving: {}'.format(args.output_dir + fname))
+    np.save(args.output_dir + fname + '.npy', {'corrs_t':corrs_t,'preds_t':preds_t,'test_t':test_t}) 
diff --git a/ridge_tools.py b/ridge_tools.py
@@ -0,0 +1,102 @@
+from numpy.linalg import inv, svd
+import numpy as np
+from sklearn.model_selection import KFold
+from sklearn.linear_model import Ridge, RidgeCV
+from sklearn.kernel_ridge import KernelRidge
+import time
+from scipy.stats import zscore
+
+def corr(X,Y):
+    return np.mean(zscore(X)*zscore(Y),0)
+
+def R2(Pred,Real):
+    SSres = np.mean((Real-Pred)**2,0)
+    SStot = np.var(Real,0)
+    return np.nan_to_num(1-SSres/SStot)
+
+def R2r(Pred,Real):
+    R2rs = R2(Pred,Real)
+    ind_neg = R2rs<0
+    R2rs = np.abs(R2rs)
+    R2rs = np.sqrt(R2rs)
+    R2rs[ind_neg] *= - 1
+    return R2rs
+
+def ridge(X,Y,lmbda):
+    return np.dot(inv(X.T.dot(X)+lmbda*np.eye(X.shape[1])),X.T.dot(Y))
+
+def ridge_by_lambda(X, Y, Xval, Yval, lambdas=np.array([0.1,1,10,100,1000])):
+    error = np.zeros((lambdas.shape[0],Y.shape[1]))
+    for idx,lmbda in enumerate(lambdas):
+        weights = ridge(X,Y,lmbda)
+        error[idx] = 1 - R2(np.dot(Xval,weights),Yval)
+    return error
+
+def ridge_sk(X,Y,lmbda):
+    rd = Ridge(alpha = lmbda)
+    rd.fit(X,Y)
+    return rd.coef_.T
+
+def ridgeCV_sk(X,Y,lmbdas):
+    rd = RidgeCV(alphas = lmbdas)
+    rd.fit(X,Y)
+    return rd.coef_.T
+
+def ridge_by_lambda_sk(X, Y, Xval, Yval, lambdas=np.array([0.1,1,10,100,1000])):
+    error = np.zeros((lambdas.shape[0],Y.shape[1]))
+    for idx,lmbda in enumerate(lambdas):
+        weights = ridge_sk(X,Y,lmbda)
+        error[idx] = 1 -  R2(np.dot(Xval,weights),Yval)
+    return error
+
+
+def cross_val_ridge(train_features,train_data, test_features, n_splits = 10,
+                    lambdas = np.array([10**i for i in range(-6,10)]),
+                    method = 'plain',
+                    do_plot = False):
+
+    ridge_1 = dict(plain = ridge_by_lambda,
+                   ridge_sk = ridge_by_lambda_sk)[method]
+    ridge_2 = dict(plain = ridge,
+                   ridge_sk = ridge_sk)[method]
+
+    n_voxels = train_data.shape[1]
+    nL = lambdas.shape[0]
+    r_cv = np.zeros((nL, train_data.shape[1]))
+
+    print(train_features.shape, train_data.shape)
+
+    kf = KFold(n_splits=n_splits)
+    start_t = time.time()
+    for icv, (trn, val) in enumerate(kf.split(train_data)):
+        #print('ntrain = {}'.format(train_features[trn].shape[0]))
+        cost = ridge_1(train_features[trn],train_data[trn],
+                               train_features[val],train_data[val],
+                               lambdas=lambdas)
+        if do_plot:
+            import matplotlib.pyplot as plt
+            plt.figure()
+            plt.imshow(cost,aspect = 'auto')
+        r_cv += cost
+    if do_plot:
+        plt.figure()
+        plt.imshow(r_cv,aspect='auto',cmap = 'RdBu_r');
+
+    argmin_lambda = np.argmin(r_cv,axis = 0)
+    preds = np.zeros((test_features.shape[0],train_data.shape[1]))
+    preds_train = np.zeros((train_features.shape[0],train_data.shape[1]))
+    weights_kernel_space = np.zeros((train_features.shape[0],train_data.shape[1]))
+    for idx_lambda in range(lambdas.shape[0]): # this is much faster than iterating over voxels!
+        idx_vox = argmin_lambda == idx_lambda
+        print(idx_lambda, argmin_lambda)
+        if sum(idx_vox) == 0:
+            continue
+
+
+        print(train_features.shape, train_data[:,idx_vox].shape, test_features.shape)
+        preds[:,idx_vox], preds_train[:,idx_vox], weights_kernel_space[:,idx_vox] = ridge_2(train_features, train_data[:,idx_vox], test_features, lambdas[idx_lambda])
+    if do_plot:
+        plt.figure()
+        plt.imshow(weights,aspect='auto',cmap = 'RdBu_r',vmin = -0.5,vmax = 0.5);
+
+    return preds,preds_train, weights_kernel_space, np.array([lambdas[i] for i in argmin_lambda])