From 32cfa3ed897f6a4476c50d01d7c5878851298adc Mon Sep 17 00:00:00 2001
From: Justin Kiggins
Date: Tue, 31 Jul 2018 14:33:58 -0700
Subject: [PATCH] first pass at downloading crcns datasets

:bug: uses six.zip for py2/3 compat

better

fix typo

ignore dataset downloads for code coverage

adds a test
---
 codecov.yml                    |   4 +-
 docs/api.rst                   |   8 +
 neuroglia/datasets/__init__.py |   1 +
 neuroglia/datasets/crcns.py    | 285 +++++++++++++++++++++++++++++++++
 neuroglia/datasets/figshare.py | 106 ++++++++++++
 neuroglia/datasets/utils.py    |  15 ++
 requirements.txt               |   1 +
 tests/datasets/test_utils.py   |  12 ++
 8 files changed, 431 insertions(+), 1 deletion(-)
 create mode 100644 neuroglia/datasets/crcns.py
 create mode 100644 neuroglia/datasets/figshare.py
 create mode 100644 neuroglia/datasets/utils.py
 create mode 100644 tests/datasets/test_utils.py

diff --git a/codecov.yml b/codecov.yml
index 5172eb4..d6df231 100644
--- a/codecov.yml
+++ b/codecov.yml
@@ -1,2 +1,4 @@
 ignore:
-  - "neuroglia/calcium/oasis/functions.py"
+  - "neuroglia/datasets/crcns.py"
+  - "neuroglia/datasets/figshare.py"
+  - "neuroglia/calcium/oasis/functions.py"
\ No newline at end of file
diff --git a/docs/api.rst b/docs/api.rst
index edace06..ff4ebff 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -79,3 +79,11 @@ Tensor transformers
    :toctree: generated/
 
    tensor.ResponseReducer
+
+Datasets
+--------
+
+.. autosummary::
+   :toctree: generated/
+
+   datasets.fetch_rat_hippocampus_foraging
diff --git a/neuroglia/datasets/__init__.py b/neuroglia/datasets/__init__.py
index 8874339..9995341 100644
--- a/neuroglia/datasets/__init__.py
+++ b/neuroglia/datasets/__init__.py
@@ -1 +1,2 @@
+from .crcns import fetch_rat_hippocampus_foraging
 from .synthetic_calcium import make_calcium_traces
diff --git a/neuroglia/datasets/crcns.py b/neuroglia/datasets/crcns.py
new file mode 100644
index 0000000..6272dc5
--- /dev/null
+++ b/neuroglia/datasets/crcns.py
@@ -0,0 +1,285 @@
+import os
+import tarfile
+from collections import namedtuple
+import requests
+import pandas as pd
+import numpy as np
+from sklearn.datasets.base import _sha256, _pkl_filepath
+from sklearn.utils import Bunch
+from .utils import get_neuroglia_data_home
+
+try:
+    from itertools import izip as zip
+except ImportError:  # must be python3
+    pass
+
+URL = 'https://portal.nersc.gov/project/crcns/download/index.php'
+
+
+def get_environ_username():
+    return os.environ['CRCNS_USER']
+
+
+def get_environ_password():
+    return os.environ['CRCNS_PASSWORD']
+
+
+Payload = namedtuple('Payload', ['username', 'password', 'fn', 'submit'])
+
+
+def _create_payload(username, password, path, filename):
+    datafile = "{}/{}".format(path, filename)
+    return dict(
+        username=username,
+        password=password,
+        fn=datafile,
+        submit='Login',
+    )
+
+
+def _create_local_filename(dest, datafile):
+    if dest is None:
+        dest = os.getcwd()
+    return os.path.join(
+        dest,
+        datafile.split('/')[-1],
+    )
+
+
+def crcns_retrieve(request_payload, local_filename, chunk_size=1024):
+    with requests.Session() as s:
+        r = s.post(URL, data=request_payload, stream=True)
+        with open(local_filename, 'wb') as f:
+            for chunk in r.iter_content(chunk_size=chunk_size):
+                if chunk:
+                    f.write(chunk)
+    return local_filename
+
+
+def _fetch_crcns_datafile(crcns, local_filename=None, username=None, password=None, chunk_size=1024):
+
+    if local_filename is None:
+        local_filename = crcns.filename
+
+    # if a file with the expected checksum is already cached, skip the download
+    if os.path.exists(local_filename):
+        checksum = _sha256(local_filename)
+        if crcns.checksum == checksum:
+            return local_filename
+
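+    # fall back to credentials from the environment (CRCNS_USER and
+    # CRCNS_PASSWORD) when none are passed explicitly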
+    if username is None:
+        username = get_environ_username()
+    if password is None:
+        password = get_environ_password()
+
+    request_payload = _create_payload(
+        username,
+        password,
+        crcns.path,
+        crcns.filename,
+    )
+
+    crcns_retrieve(request_payload, local_filename, chunk_size)
+
+    checksum = _sha256(local_filename)
+
+    if crcns.checksum != checksum:
+        raise IOError("{} has an SHA256 checksum ({}) "
+                      "differing from expected ({}), "
+                      "file may be corrupted.".format(local_filename, checksum,
+                                                      crcns.checksum))
+    return local_filename
+
+
+CRCNSFileMetadata = namedtuple(
+    'CRCNSFileMetadata',
+    ['filename', 'path', 'checksum'],
+)
+
+
+def read_spikes_from_tar(f):
+
+    SPIKES_HZ = 20000
+
+    timestamp_files = (
+        'crcns/hc2/ec014.333/ec014.333.res.1',
+        'crcns/hc2/ec014.333/ec014.333.res.2',
+        'crcns/hc2/ec014.333/ec014.333.res.3',
+        'crcns/hc2/ec014.333/ec014.333.res.4',
+        'crcns/hc2/ec014.333/ec014.333.res.5',
+        'crcns/hc2/ec014.333/ec014.333.res.6',
+        'crcns/hc2/ec014.333/ec014.333.res.7',
+        'crcns/hc2/ec014.333/ec014.333.res.8',
+    )
+
+    cluster_files = (
+        'crcns/hc2/ec014.333/ec014.333.clu.1',
+        'crcns/hc2/ec014.333/ec014.333.clu.2',
+        'crcns/hc2/ec014.333/ec014.333.clu.3',
+        'crcns/hc2/ec014.333/ec014.333.clu.4',
+        'crcns/hc2/ec014.333/ec014.333.clu.5',
+        'crcns/hc2/ec014.333/ec014.333.clu.6',
+        'crcns/hc2/ec014.333/ec014.333.clu.7',
+        'crcns/hc2/ec014.333/ec014.333.clu.8',
+    )
+
+    spikes = []
+
+    for timestamps, clusters in zip(timestamp_files, cluster_files):
+        shank = int(timestamps[-1])
+
+        ts = f.extractfile(timestamps)
+        clu = f.extractfile(clusters)
+        # the first line of a .clu file holds the number of clusters;
+        # skip it so cluster labels stay aligned with spike timestamps
+        clu.readline()
+        for frame, cluster in zip(ts.readlines(), clu.readlines()):
+            # clusters 0 and 1 conventionally hold artifacts and unsorted spikes
+            if int(cluster) > 1:
+                spike = dict(
+                    time=float(frame) / SPIKES_HZ,
+                    neuron='{}-{:02d}'.format(shank, int(cluster)),
+                )
+                spikes.append(spike)
+
+    spikes = pd.DataFrame(spikes)
+    return spikes
+
+
+def read_location_from_tar(f):
+
+    LOCATION_HZ = 39.06
+
+    location_file = 'crcns/hc2/ec014.333/ec014.333.whl'
+    loc = pd.read_csv(
+        f.extractfile(location_file),
+        sep='\t',
+        header=None,  # .whl files have no header row
+        names=['x', 'y', 'x2', 'y2'],
+    )
+    loc = loc.replace(-1.0, np.nan)
+    loc['time'] = loc.index / LOCATION_HZ
+    loc = loc.dropna()
+    return loc
+
+
+def load_hc2(tar_path):
+
+    with tarfile.open(mode="r:gz", name=tar_path) as f:
+        spikes = read_spikes_from_tar(f)
+        location = read_location_from_tar(f)
+
+    # truncate neuronal data to the time when the rat is exploring
+    min_time = location['time'].min()
+    max_time = location['time'].max()
+
+    spikes = spikes[
+        (spikes['time'] >= min_time)
+        & (spikes['time'] <= max_time)
+    ]
+
+    # set the approximate center of the arena to zero in x & y
+    x0 = np.mean([location['x2'].max(), location['x2'].min()])
+    y0 = np.mean([location['y2'].max(), location['y2'].min()])
+
+    location['x'] -= x0
+    location['x2'] -= x0
+    location['y'] -= y0
+    location['y2'] -= y0
+
+    return spikes, location
+
+
+def fetch_rat_hippocampus_foraging(data_home=None, username=None, password=None, download_if_missing=True):
+    """Loader for experiment ec014.333 from the HC-2 dataset on crcns.org
+
+    More info on this dataset: https://crcns.org/data-sets/hc/hc-2/about-hc-2
+
+    To download this data, you must have a CRCNS account. Request an account
+    at https://crcns.org/request-account/
+
+    Warning! The first time you run this function, it will download a 3.3GB file.
+
+    Parameters
+    ----------
+    data_home : optional, default: None
+        Specify another download and cache folder for the dataset. By default,
+        data is stored in a 'neuroglia' subfolder of the scikit-learn data
+        home ('~/scikit_learn_data').
+    username : optional, default: None
+        CRCNS username. All CRCNS datasets require a login. If `None`
+        (default), the `CRCNS_USER` environment variable is used.
+    password : optional, default: None
+        CRCNS password. If `None` (default), the `CRCNS_PASSWORD`
+        environment variable is used.
+    download_if_missing : optional, default=True
+        If False, raise an IOError if the data is not locally available
+        instead of trying to download it from the source site.
+
+    Returns
+    -------
+    dataset : dict-like object with the following attributes:
+        dataset.spikes : dataframe
+            Each row is a single spike at time `time` elicited by neuron `neuron`
+        dataset.location : dataframe
+            Each row is a sample of the rat's position, with the location of
+            the head designated by (x, y) and the location of the back
+            designated by (x2, y2)
+
+    Notes
+    -----
+    This dataset consists of 58 simultaneously recorded neurons from the rat
+    hippocampus, along with coordinates of the rat's position while it forages
+    in an open arena (180cm x 180cm) for 92 minutes.
+
+    References
+    ----------
+    Mizuseki K, Sirota A, Pastalkova E, Buzsaki G. (2009): Multi-unit recordings
+    from the rat hippocampus made during open field foraging
+    http://dx.doi.org/10.6080/K0Z60KZ9
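+
+    Examples
+    --------
+    A minimal usage sketch, assuming a valid CRCNS account with credentials
+    set in the `CRCNS_USER` and `CRCNS_PASSWORD` environment variables:
+
+    >>> from neuroglia.datasets import fetch_rat_hippocampus_foraging
+    >>> dataset = fetch_rat_hippocampus_foraging()  # doctest: +SKIP
+    >>> spikes, location = dataset.spikes, dataset.location  # doctest: +SKIP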
+
+    """
+
+    data_home = get_neuroglia_data_home(data_home=data_home)
+    if not os.path.exists(data_home):
+        os.makedirs(data_home)
+
+    # check if local files exist; if so, load them, otherwise download raw
+    spikes_path = _pkl_filepath(data_home, 'crcns_hc2_spikes.pkl')
+    location_path = _pkl_filepath(data_home, 'crcns_hc2_location.pkl')
+
+    if (not os.path.exists(spikes_path)) or (not os.path.exists(location_path)):
+        if not download_if_missing:
+            raise IOError("Data not found and `download_if_missing` is False")
+
+        tar_path = os.path.join(data_home, 'crcns_hc2.tar.gz')
+
+        crcns = CRCNSFileMetadata(
+            path="hc-2/ec014.333",
+            filename="ec014.333.tar.gz",
+            checksum='819d9060bcdd439a2024ee44cfb3e7be45056632af052e524e0e23f139c6a260',
+        )
+
+        local_filename = _fetch_crcns_datafile(
+            crcns=crcns,
+            local_filename=tar_path,
+            username=username,
+            password=password,
+        )
+
+        spikes, location = load_hc2(local_filename)
+
+        spikes.to_pickle(spikes_path)
+        location.to_pickle(location_path)
+
+        os.remove(tar_path)
+    else:
+        spikes = pd.read_pickle(spikes_path)
+        location = pd.read_pickle(location_path)
+
+    return Bunch(
+        spikes=spikes,
+        location=location,
+    )
diff --git a/neuroglia/datasets/figshare.py b/neuroglia/datasets/figshare.py
new file mode 100644
index 0000000..e01bc98
--- /dev/null
+++ b/neuroglia/datasets/figshare.py
@@ -0,0 +1,106 @@
+import os
+import numpy as np
+import pandas as pd
+from collections import namedtuple
+from scipy.io import loadmat
+from six.moves.urllib.request import urlretrieve
+from sklearn.datasets.base import _pkl_filepath
+from sklearn.utils import Bunch
+from .utils import _md5, get_neuroglia_data_home
+
+
+FigshareFileMetadata = namedtuple(
+    'FigshareFileMetadata',
+    ['url', 'filename', 'md5'],
+)
+
+
+def fetch_10k_neurons(data_home=None, download_if_missing=True):
+    """Loader for the '10k neurons' spontaneous activity recording hosted on figshare."""
+
+    data_home = get_neuroglia_data_home(data_home=data_home)
+    if not os.path.exists(data_home):
+        os.makedirs(data_home)
+
+    # check if local files exist; if so, load them, otherwise download raw
+    calcium_path = _pkl_filepath(data_home, 'tenk_neurons.pkl')
+    behavior_path = _pkl_filepath(data_home, 'tenk_behavior.pkl')
+
+    if (not os.path.exists(calcium_path)) or (not os.path.exists(behavior_path)):
+        if not download_if_missing:
+            raise IOError("Data not found and `download_if_missing` is False")
+
+        mat_path = os.path.join(data_home, 'figshare_10k.mat')
+
+        figshare = FigshareFileMetadata(
+            url="https://ndownloader.figshare.com/files/11152646",
+            filename="spont_M161025_MP030_2016-11-20.mat",
+            md5="54ffdc817e82710952dbee2655d4d1d2",
+        )
+
+        _fetch_figshare_datafile(
+            figshare=figshare,
+            local_filename=mat_path,
+        )
+
+        calcium, behavior = load_10k(mat_path)
+
+        calcium.to_pickle(calcium_path)
+        behavior.to_pickle(behavior_path)
+
+        os.remove(mat_path)
+    else:
+        calcium = pd.read_pickle(calcium_path)
+        behavior = pd.read_pickle(behavior_path)
+
+    return Bunch(
+        calcium=calcium,
+        behavior=behavior,
+    )
+
+
+def load_10k(mat_path):
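+
+    # layout assumed from the code below: 'Fsp' holds deconvolved activity
+    # as a (neurons x timepoints) array sampled every 0.25 s, and 'beh' is
+    # a MATLAB struct with running speed and pupil area on the same clock;
+    # loadmat returns structs as nested object arrays, hence the [0][0]
+    # indexing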
+    # read in the data and return neuroglia-friendly dataframes
+    data = loadmat(mat_path)
+
+    tres = 0.25
+    time = np.arange(0, data['Fsp'].shape[1] * tres, tres)
+
+    calcium_events = pd.DataFrame(
+        data['Fsp'].T,
+        index=time,
+        columns=['neuron_{}'.format(ii) for ii in range(data['Fsp'].shape[0])],
+    )
+
+    behavior = pd.DataFrame(
+        {
+            'speed': np.squeeze(data['beh']['runSpeed'][0][0]),
+            'pupil_size': np.squeeze(data['beh']['pupil'][0][0]['area'][0][0]),
+        },
+        index=time,
+    )
+
+    return calcium_events, behavior
+
+
+def _fetch_figshare_datafile(figshare, local_filename=None):
+
+    if local_filename is None:
+        local_filename = figshare.filename
+
+    # if a file with the expected checksum is already cached, skip the download
+    if os.path.exists(local_filename):
+        checksum = _md5(local_filename)
+        if figshare.md5 == checksum:
+            return local_filename
+
+    urlretrieve(figshare.url, local_filename)
+
+    checksum = _md5(local_filename)
+
+    if figshare.md5 != checksum:
+        raise IOError("{} has an md5 checksum ({}) "
+                      "differing from expected ({}), "
+                      "file may be corrupted.".format(local_filename, checksum,
+                                                      figshare.md5))
+    return local_filename
diff --git a/neuroglia/datasets/utils.py b/neuroglia/datasets/utils.py
new file mode 100644
index 0000000..437f7f5
--- /dev/null
+++ b/neuroglia/datasets/utils.py
@@ -0,0 +1,15 @@
+import os
+import hashlib
+from sklearn.datasets.base import get_data_home
+
+
+def get_neuroglia_data_home(data_home=None):
+    return os.path.join(get_data_home(data_home=data_home), 'neuroglia')
+
+
+def _md5(filename):
+    # read as bytes so hashing works the same on python 2 and 3
+    with open(filename, 'rb') as f:
+        data = f.read()
+    return hashlib.md5(data).hexdigest()
diff --git a/requirements.txt b/requirements.txt
index eaff179..737fa13 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ scipy
 pandas
 xarray
 scikit-learn
+requests
diff --git a/tests/datasets/test_utils.py b/tests/datasets/test_utils.py
new file mode 100644
index 0000000..c090d35
--- /dev/null
+++ b/tests/datasets/test_utils.py
@@ -0,0 +1,12 @@
+import os
+
+from neuroglia.datasets.utils import get_neuroglia_data_home
+
+HOME = os.path.join(
+    os.path.expanduser("~"),
+    'data',
+)
+
+
+def test_get_neuroglia_data_home():
+    EXPECTED = os.path.join(HOME, 'neuroglia')
+    data_home = get_neuroglia_data_home(data_home=HOME)
+    assert data_home == EXPECTED
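+
+
+# a minimal sketch of a test for the `_md5` helper, assuming pytest's
+# built-in `tmpdir` fixture; hashes a small file and compares against
+# hashlib directly
+def test_md5(tmpdir):
+    import hashlib
+
+    from neuroglia.datasets.utils import _md5
+
+    p = tmpdir.join("data.bin")
+    p.write_binary(b"neuroglia")
+    assert _md5(str(p)) == hashlib.md5(b"neuroglia").hexdigest()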