AllenInstitute · neuromusic · Jul 31, 2018 · Oct 17, 2018 · Nov 9, 2018 · Nov 10, 2018
diff --git a/codecov.yml b/codecov.yml
@@ -1,2 +1,4 @@
 ignore:
-  - "neuroglia/calcium/oasis/functions.py"
+  - "neuroglia/datasets/crcns.py"
+  - "neuroglia/datasets/figshare.py"
+  - "neuroglia/calcium/oasis/functions.py"
diff --git a/docs/api.rst b/docs/api.rst
@@ -75,3 +75,11 @@ Tensor transformers
    :toctree: generated/
 
     tensor.ResponseReducer
+
+Datasets
+-------------------
+
+.. autosummary::
+   :toctree: generated/
+
+    datasets.fetch_rat_hippocampus_foraging
diff --git a/examples/plot_rat_hippocampus_foraging.py b/examples/plot_rat_hippocampus_foraging.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python
+"""
+Dataset: CA1 activity during foraging
+=====================================
+
+This is an example of how to access data recorded from CA1 during open field foraging
+
+Fetching the data requires a CRCNS account; when no credentials are passed to
+the loader they are read from the ``CRCNS_USER`` and ``CRCNS_PASSWORD``
+environment variables.
+
+"""
+
+from neuroglia.datasets import fetch_rat_hippocampus_foraging
+
+dataset = fetch_rat_hippocampus_foraging()
+
+#########################################
+# Let's plot the path in the free field
+
+import matplotlib.pyplot as plt
+plt.plot(dataset.location['x'], dataset.location['y'])
+plt.axis('equal')
+plt.show()
+
+#########################################
+# Create a feature vector, binning spikes for each time point
+
+from neuroglia.spike import Binner
+
+binner = Binner(sample_times=dataset.location['time'])
+response = binner.fit_transform(dataset.spikes)
+
+#########################################
+# Plot CA1 activity
+
+response.plot()
+plt.show()
diff --git a/neuroglia/datasets/__init__.py b/neuroglia/datasets/__init__.py
@@ -1 +1,2 @@
+from .crcns import fetch_rat_hippocampus_foraging
 from .synthetic_calcium import make_calcium_traces
diff --git a/neuroglia/datasets/crcns.py b/neuroglia/datasets/crcns.py
@@ -0,0 +1,285 @@
+# from os.path import exists
+# from os import makedirs, remove
+import os
+import tarfile
+from collections import namedtuple
+import requests
+import pandas as pd
+import numpy as np
+from sklearn.datasets.base import _sha256, _pkl_filepath
+from sklearn.utils import Bunch
+from .utils import get_neuroglia_data_home
+
+try:
+    from itertools import izip as zip
+except ImportError:  # must be python3
+    pass
+
+URL = 'https://portal.nersc.gov/project/crcns/download/index.php'
+
+
+def get_environ_username():
+    """Return the CRCNS username stored in the ``CRCNS_USER`` environment variable.
+
+    Raises ``KeyError`` when the variable is not set.
+    """
+    username = os.environ['CRCNS_USER']
+    return username
+
+
+def get_environ_password():
+    """Return the CRCNS password stored in the ``CRCNS_PASSWORD`` environment variable.
+
+    Raises ``KeyError`` when the variable is not set.
+    """
+    password = os.environ['CRCNS_PASSWORD']
+    return password
+
+
+Payload = namedtuple('Payload',['username','password','fn','submit'])
+
+
+def _create_payload(username,password,path,filename):
+    """Build the form fields posted to the CRCNS download page.
+
+    The requested file is identified by ``fn``, which is ``path`` and
+    ``filename`` joined with a forward slash.
+    """
+    remote_file = "{0}/{1}".format(path, filename)
+    payload = {
+        'username': username,
+        'password': password,
+        'fn': remote_file,
+        'submit': 'Login',
+    }
+    return payload
+
+
+def _create_local_filename(dest,datafile):
+    """Return the local path at which a remote data file should be stored.
+
+    Parameters
+    ----------
+    dest : str or None
+        Target directory. When ``None`` the current working directory is used.
+    datafile : str
+        Remote file path using forward slashes; only its last component is
+        kept as the local file name.
+
+    Returns
+    -------
+    str
+        ``dest`` joined with the base name of ``datafile``.
+    """
+    if dest is None:
+        # `os.cwd` does not exist; `os.getcwd` is the stdlib call.
+        dest = os.getcwd()
+    basename = datafile.rsplit('/', 1)[-1]
+    return os.path.join(dest, basename)
+
+
+def crcns_retrieve(request_payload,local_filename):
+    """POST `request_payload` to the CRCNS download URL and save the response.
+
+    The response body is streamed to `local_filename` in 1024-byte chunks.
+    The HTTP status is not inspected here, so whatever the server answers
+    is written to disk.
+
+    Returns `local_filename`.
+    """
+    with requests.Session() as session:
+        response = session.post(URL, data=request_payload, stream=True)
+        with open(local_filename, 'wb') as out:
+            for block in response.iter_content(chunk_size=1024):
+                if not block:
+                    # skip empty chunks
+                    continue
+                out.write(block)
+    return local_filename
+
+
+def _fetch_crcns_datafile(crcns,local_filename=None,username=None,password=None,chunk_size=1024):
+    """Download a CRCNS file unless a verified copy already exists locally.
+
+    Parameters
+    ----------
+    crcns : object with `filename`, `path` and `checksum` attributes
+        Describes the remote file and its expected SHA256 checksum.
+    local_filename : optional, default: None
+        Where to store the file; defaults to `crcns.filename`.
+    username, password : optional, default: None
+        CRCNS credentials. Missing values are read from the environment via
+        `get_environ_username` / `get_environ_password`.
+    chunk_size : optional, default: 1024
+        Not used by this function.
+
+    Returns
+    -------
+    local_filename : path of the verified file
+
+    Raises
+    ------
+    IOError
+        If the downloaded file does not match `crcns.checksum`.
+    """
+    if local_filename is None:
+        local_filename = crcns.filename
+
+    # An existing file with the right checksum is reused as-is.
+    if os.path.exists(local_filename) and _sha256(local_filename) == crcns.checksum:
+        return local_filename
+
+    if username is None:
+        username = get_environ_username()
+    if password is None:
+        password = get_environ_password()
+
+    crcns_retrieve(
+        _create_payload(username, password, crcns.path, crcns.filename),
+        local_filename,
+    )
+
+    actual = _sha256(local_filename)
+    if actual == crcns.checksum:
+        return local_filename
+
+    message = (
+        "{} has an SHA256 checksum ({}) "
+        "differing from expected ({}), "
+        "file may be corrupted."
+    )
+    raise IOError(message.format(local_filename, actual, crcns.checksum))
+
+
+CRCNSFileMetadata = namedtuple(
+    'CRCNSFileMetadata',
+    ['filename', 'path', 'checksum'],
+)
+
+def read_spikes_from_tar(f):
+    """Read the clustered spikes of session ec014.333 from an open tar archive.
+
+    For each of the eight shanks, the ``.res.<shank>`` member (one spike
+    sample index per line) is paired line by line with the ``.clu.<shank>``
+    member (one cluster id per line). Only spikes whose cluster id is
+    greater than 1 are kept.
+
+    Parameters
+    ----------
+    f : tarfile.TarFile
+        Archive opened for reading that contains the
+        ``crcns/hc2/ec014.333/`` members.
+
+    Returns
+    -------
+    spikes : pandas.DataFrame
+        One row per kept spike with `time` (sample index divided by
+        ``SPIKES_HZ``, presumably seconds) and `neuron`, a label of the
+        form ``s<shank>:n<cluster>``.
+    """
+    SPIKES_HZ = 20000
+    N_SHANKS = 8
+    member_template = 'crcns/hc2/ec014.333/ec014.333.{}.{}'
+
+    spikes = []
+
+    for shank in range(1, N_SHANKS + 1):
+        ts = f.extractfile(member_template.format('res', shank))
+        clu = f.extractfile(member_template.format('clu', shank))
+        # NOTE(review): .clu files in this format may start with a
+        # cluster-count header line that .res files lack; if so, this pairing
+        # is shifted by one line. Confirm against the raw data.
+        # Iterate the members lazily instead of loading every line at once.
+        for frame, cluster in zip(ts, clu):
+            cluster_id = int(cluster)
+            if cluster_id > 1:
+                spikes.append(dict(
+                    time=float(frame) / SPIKES_HZ,
+                    neuron='s{}:n{:02d}'.format(shank, cluster_id),
+                ))
+
+    return pd.DataFrame(spikes)
+
+
+def read_location_from_tar(f):
+    """Read the position tracking of session ec014.333 from an open tar archive.
+
+    The tab-separated ``.whl`` member is parsed into the columns `x`, `y`,
+    `x2`, `y2`; values of -1.0 are treated as missing and rows containing
+    them are dropped. A `time` column is derived from the row index and
+    ``LOCATION_HZ``.
+
+    Returns a pandas.DataFrame with columns `x`, `y`, `x2`, `y2`, `time`.
+    """
+    LOCATION_HZ = 39.06
+    location_file = 'crcns/hc2/ec014.333/ec014.333.whl'
+
+    columns = ['x','y','x2','y2']
+    whl = f.extractfile(location_file)
+    # NOTE(review): with `header=0` plus `names`, pandas discards the first
+    # line of the file as a header. Confirm the .whl file really has one.
+    loc = pd.read_csv(whl, sep='\t', header=0, names=columns)
+
+    loc = loc.replace(-1.0,np.nan)
+    # The time stamp is computed before dropping rows so it reflects the
+    # original sample position.
+    loc['time'] = loc.index / LOCATION_HZ
+    return loc.dropna()
+
+
+
+
+def load_hc2(tar_path):
+    """Load spikes and positions from the gzipped HC-2 archive at `tar_path`.
+
+    Spikes are restricted to the time span covered by the position samples,
+    and all position columns are shifted so that the midpoint of the
+    `x2`/`y2` extent becomes the origin.
+
+    Returns the tuple ``(spikes, location)`` of dataframes.
+    """
+    with tarfile.open(mode="r:gz", name=tar_path) as archive:
+        spikes = read_spikes_from_tar(archive)
+        location = read_location_from_tar(archive)
+
+    # keep only the spikes fired while the rat's position was tracked
+    first_sample = location['time'].min()
+    last_sample = location['time'].max()
+    spikes = spikes[spikes['time'].between(first_sample, last_sample)]
+
+    # approximate arena center: midpoint of the x2 / y2 ranges
+    x0 = np.mean([location['x2'].max(),location['x2'].min()])
+    y0 = np.mean([location['y2'].max(),location['y2'].min()])
+
+    for column, offset in (('x', x0), ('x2', x0), ('y', y0), ('y2', y0)):
+        location[column] -= offset
+
+    return spikes, location
+
+
+def fetch_rat_hippocampus_foraging(data_home=None,username=None,password=None,download_if_missing=True):
+    """Loader for experiment ec014.333 from the HC-2 dataset on crcns.org
+
+    More info on this dataset: https://crcns.org/data-sets/hc/hc-2/about-hc-2
+
+    To download this data, you must have a CRCNS account. Request an account
+    at https://crcns.org/request-account/
+
+    Warning! The first time you run this function, it will download a 3.3GB file.
+    The archive is removed again once the spikes and locations have been
+    cached as pickle files in the data folder.
+
+    Parameters
+    ----------
+    data_home : optional, default: None
+        Specify another download and cache folder for the dataset. The value
+        is resolved by `get_neuroglia_data_home` and the folder is created
+        if it does not exist.
+    username : optional, default: None
+        CRCNS username. All CRCNS datasets need a username to login. If `None`
+        (default), the `CRCNS_USER` environment variable is used.
+    password : optional, default: None
+        CRCNS password. All CRCNS datasets need a password to login. If `None`
+        (default), the `CRCNS_PASSWORD` environment variable is used.
+    download_if_missing : optional, default=True
+        If False, raise a IOError if the data is not locally available
+        instead of trying to download the data from the source site.
+
+    Returns
+    -------
+    dataset : dict-like object with the following attributes:
+    dataset.spikes : dataframe with columns `time` and `neuron`
+        Each row is a single spike at `time` elicited from neuron `neuron`
+    dataset.location : dataframe with columns `x`, `y`, `x2`, `y2`, `time`
+        Each row is a sample of the rat's position, with the location of the
+        head designated by (x,y) and the location of the back designated by
+        (x2, y2)
+
+    Raises
+    ------
+    IOError
+        If the cached data is missing and `download_if_missing` is False, or
+        if the downloaded archive fails its SHA256 checksum.
+    KeyError
+        If a download is needed, credentials are not given, and the
+        corresponding environment variable is not set.
+
+    Notes
+    ------
+    This dataset consists of 58 simultaneously recorded neurons from the rat
+    hippocampus along with coordinates of its position while it forages in an
+    open arena (180cm x 180cm) for 92 minutes.
+
+    References
+    ----------
+
+    Mizuseki K, Sirota A, Pastalkova E, Buzsaki G. (2009): Multi-unit recordings
+    from the rat hippocampus made during open field foraging
+    http://dx.doi.org/10.6080/K0Z60KZ9
+
+    """
+
+    data_home = get_neuroglia_data_home(data_home=data_home)
+    if not os.path.exists(data_home):
+        os.makedirs(data_home)
+
+    spikes_path = _pkl_filepath(data_home, 'crcns_hc2_spikes.pkl')
+    location_path = _pkl_filepath(data_home, 'crcns_hc2_location.pkl')
+
+    # Fast path: both cached pickles exist, so no download is needed.
+    if os.path.exists(spikes_path) and os.path.exists(location_path):
+        return Bunch(
+            spikes=pd.read_pickle(spikes_path),
+            location=pd.read_pickle(location_path)
+        )
+
+    if not download_if_missing:
+        raise IOError("Data not found and `download_if_missing` is False")
+
+    crcns = CRCNSFileMetadata(
+        path = "hc-2/ec014.333",
+        filename = "ec014.333.tar.gz",
+        checksum = '819d9060bcdd439a2024ee44cfb3e7be45056632af052e524e0e23f139c6a260',
+    )
+
+    tar_path = _fetch_crcns_datafile(
+        crcns=crcns,
+        local_filename=os.path.join(data_home,'crcns_hc2.tar.gz'),
+        username=username,
+        password=password,
+    )
+
+    spikes, location = load_hc2(tar_path)
+
+    spikes.to_pickle(spikes_path)
+    location.to_pickle(location_path)
+
+    # The raw archive is no longer needed once the pickles are written.
+    os.remove(tar_path)
+
+    return Bunch(
+        spikes=spikes,
+        location=location
+    )
Original file line number	Diff line number	Diff line change
		@@ -1 +1,2 @@
		from .crcns import fetch_rat_hippocampus_foraging
		from .synthetic_calcium import make_calcium_traces