From 32cfa3ed897f6a4476c50d01d7c5878851298adc Mon Sep 17 00:00:00 2001
From: Justin Kiggins
Date: Tue, 31 Jul 2018 14:33:58 -0700
Subject: [PATCH] first pass at downloading crcns datasets

:bug: uses six.zip for py2/3 compat

better

fix typo

ignore dataset downloads for code coverage

adds a test
---
 codecov.yml                    |   4 +-
 docs/api.rst                   |   8 +
 neuroglia/datasets/__init__.py |   1 +
 neuroglia/datasets/crcns.py    | 285 +++++++++++++++++++++++++++++++++
 neuroglia/datasets/figshare.py | 106 ++++++++++++
 neuroglia/datasets/utils.py    |  15 ++
 requirements.txt               |   1 +
 tests/datasets/test_utils.py   |  12 ++
 8 files changed, 431 insertions(+), 1 deletion(-)
 create mode 100644 neuroglia/datasets/crcns.py
 create mode 100644 neuroglia/datasets/figshare.py
 create mode 100644 neuroglia/datasets/utils.py
 create mode 100644 tests/datasets/test_utils.py

diff --git a/codecov.yml b/codecov.yml
index 5172eb4..d6df231 100644
--- a/codecov.yml
+++ b/codecov.yml
@@ -1,2 +1,4 @@
 ignore:
-  - "neuroglia/calcium/oasis/functions.py"
+  - "neuroglia/datasets/crcns.py"
+  - "neuroglia/datasets/figshare.py"
+  - "neuroglia/calcium/oasis/functions.py"
\ No newline at end of file
diff --git a/docs/api.rst b/docs/api.rst
index edace06..ff4ebff 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -79,3 +79,11 @@ Tensor transformers
    :toctree: generated/
 
    tensor.ResponseReducer
+
+Datasets
+--------
+
+.. autosummary::
+   :toctree: generated/
+
+   datasets.fetch_rat_hippocampus_foraging
diff --git a/neuroglia/datasets/__init__.py b/neuroglia/datasets/__init__.py
index 8874339..9995341 100644
--- a/neuroglia/datasets/__init__.py
+++ b/neuroglia/datasets/__init__.py
@@ -1 +1,2 @@
+from .crcns import fetch_rat_hippocampus_foraging
 from .synthetic_calcium import make_calcium_traces
diff --git a/neuroglia/datasets/crcns.py b/neuroglia/datasets/crcns.py
new file mode 100644
index 0000000..6272dc5
--- /dev/null
+++ b/neuroglia/datasets/crcns.py
@@ -0,0 +1,285 @@
+import os
+import tarfile
+from collections import namedtuple
+import requests
+import pandas as pd
+import numpy as np
+from sklearn.datasets.base import _sha256, _pkl_filepath
+from sklearn.utils import Bunch
+from .utils import get_neuroglia_data_home
+
+try:
+    from itertools import izip as zip
+except ImportError:  # must be python3
+    pass
+
+URL = 'https://portal.nersc.gov/project/crcns/download/index.php'
+
+
+def get_environ_username():
+    return os.environ['CRCNS_USER']
+
+
+def get_environ_password():
+    return os.environ['CRCNS_PASSWORD']
+
+
+Payload = namedtuple('Payload', ['username', 'password', 'fn', 'submit'])
+
+
+def _create_payload(username, password, path, filename):
+    datafile = "{}/{}".format(path, filename)
+    return dict(
+        username=username,
+        password=password,
+        fn=datafile,
+        submit='Login',
+    )
+
+
+def _create_local_filename(dest, datafile):
+    if dest is None:
+        dest = os.getcwd()
+    return os.path.join(
+        dest,
+        datafile.split('/')[-1],
+    )
+
+
+def crcns_retrieve(request_payload, local_filename, chunk_size=1024):
+    with requests.Session() as s:
+        r = s.post(URL, data=request_payload, stream=True)
+        with open(local_filename, 'wb') as f:
+            for chunk in r.iter_content(chunk_size=chunk_size):
+                if chunk:
+                    f.write(chunk)
+    return local_filename
+
+
+def _fetch_crcns_datafile(crcns, local_filename=None, username=None, password=None, chunk_size=1024):
+
+    if local_filename is None:
+        local_filename = crcns.filename
+
+    # if a file with the expected checksum is already cached, skip the download
+    if os.path.exists(local_filename):
+        checksum = _sha256(local_filename)
+        if crcns.checksum == checksum:
+            return local_filename
+
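+    # fall back to credentials from the environment (CRCNS_USER and
+    # CRCNS_PASSWORD) when none are passed explicitly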
+    if username is None:
+        username = get_environ_username()
+    if password is None:
+        password = get_environ_password()
+
+    request_payload = _create_payload(
+        username,
+        password,
+        crcns.path,
+        crcns.filename,
+    )
+
+    crcns_retrieve(request_payload, local_filename, chunk_size)
+
+    checksum = _sha256(local_filename)
+
+    if crcns.checksum != checksum:
+        raise IOError("{} has an SHA256 checksum ({}) "
+                      "differing from expected ({}), "
+                      "file may be corrupted.".format(local_filename, checksum,
+                                                      crcns.checksum))
+    return local_filename
+
+
+CRCNSFileMetadata = namedtuple(
+    'CRCNSFileMetadata',
+    ['filename', 'path', 'checksum'],
+)
+
+
+def read_spikes_from_tar(f):
+
+    SPIKES_HZ = 20000
+
+    timestamp_files = (
+        'crcns/hc2/ec014.333/ec014.333.res.1',
+        'crcns/hc2/ec014.333/ec014.333.res.2',
+        'crcns/hc2/ec014.333/ec014.333.res.3',
+        'crcns/hc2/ec014.333/ec014.333.res.4',
+        'crcns/hc2/ec014.333/ec014.333.res.5',
+        'crcns/hc2/ec014.333/ec014.333.res.6',
+        'crcns/hc2/ec014.333/ec014.333.res.7',
+        'crcns/hc2/ec014.333/ec014.333.res.8',
+    )
+
+    cluster_files = (
+        'crcns/hc2/ec014.333/ec014.333.clu.1',
+        'crcns/hc2/ec014.333/ec014.333.clu.2',
+        'crcns/hc2/ec014.333/ec014.333.clu.3',
+        'crcns/hc2/ec014.333/ec014.333.clu.4',
+        'crcns/hc2/ec014.333/ec014.333.clu.5',
+        'crcns/hc2/ec014.333/ec014.333.clu.6',
+        'crcns/hc2/ec014.333/ec014.333.clu.7',
+        'crcns/hc2/ec014.333/ec014.333.clu.8',
+    )
+
+    spikes = []
+
+    for timestamps, clusters in zip(timestamp_files, cluster_files):
+        shank = int(timestamps[-1])
+
+        ts = f.extractfile(timestamps)
+        clu = f.extractfile(clusters)
+        # the first line of a .clu file holds the number of clusters;
+        # skip it so cluster labels stay aligned with spike timestamps
+        clu.readline()
+        for frame, cluster in zip(ts.readlines(), clu.readlines()):
+            # clusters 0 and 1 conventionally hold artifacts and unsorted spikes
+            if int(cluster) > 1:
+                spike = dict(
+                    time=float(frame) / SPIKES_HZ,
+                    neuron='{}-{:02d}'.format(shank, int(cluster)),
+                )
+                spikes.append(spike)
+
+    spikes = pd.DataFrame(spikes)
+    return spikes
+
+
+def read_location_from_tar(f):
+
+    LOCATION_HZ = 39.06
+
+    location_file = 'crcns/hc2/ec014.333/ec014.333.whl'
+    loc = pd.read_csv(
+        f.extractfile(location_file),
+        sep='\t',
+        header=None,  # .whl files have no header row
+        names=['x', 'y', 'x2', 'y2'],
+    )
+    loc = loc.replace(-1.0, np.nan)
+    loc['time'] = loc.index / LOCATION_HZ
+    loc = loc.dropna()
+    return loc
+
+
+def load_hc2(tar_path):
+
+    with tarfile.open(mode="r:gz", name=tar_path) as f:
+        spikes = read_spikes_from_tar(f)
+        location = read_location_from_tar(f)
+
+    # truncate neuronal data to the time when the rat is exploring
+    min_time = location['time'].min()
+    max_time = location['time'].max()
+
+    spikes = spikes[
+        (spikes['time'] >= min_time)
+        & (spikes['time'] <= max_time)
+    ]
+
+    # set the approximate center of the arena to zero in x & y
+    x0 = np.mean([location['x2'].max(), location['x2'].min()])
+    y0 = np.mean([location['y2'].max(), location['y2'].min()])
+
+    location['x'] -= x0
+    location['x2'] -= x0
+    location['y'] -= y0
+    location['y2'] -= y0
+
+    return spikes, location
+
+
+def fetch_rat_hippocampus_foraging(data_home=None, username=None, password=None, download_if_missing=True):
+    """Loader for experiment ec014.333 from the HC-2 dataset on crcns.org
+
+    More info on this dataset: https://crcns.org/data-sets/hc/hc-2/about-hc-2
+
+    To download this data, you must have a CRCNS account. Request an account
+    at https://crcns.org/request-account/
+
+    Warning! The first time you run this function, it will download a 3.3GB file.
+
+    Parameters
+    ----------
+    data_home : optional, default: None
+        Specify another download and cache folder for the dataset. By default,
+        data is stored in a 'neuroglia' subfolder of the scikit-learn data
+        home ('~/scikit_learn_data').
+    username : optional, default: None
+        CRCNS username. All CRCNS datasets require a login. If `None`
+        (default), the `CRCNS_USER` environment variable is used.
+    password : optional, default: None
+        CRCNS password. If `None` (default), the `CRCNS_PASSWORD`
+        environment variable is used.
+    download_if_missing : optional, default=True
+        If False, raise an IOError if the data is not locally available
+        instead of trying to download it from the source site.
+
+    Returns
+    -------
+    dataset : dict-like object with the following attributes:
+        dataset.spikes : dataframe
+            Each row is a single spike at time `time` elicited by neuron `neuron`
+        dataset.location : dataframe
+            Each row is a sample of the rat's position, with the location of
+            the head designated by (x, y) and the location of the back
+            designated by (x2, y2)
+
+    Notes
+    -----
+    This dataset consists of 58 simultaneously recorded neurons from the rat
+    hippocampus, along with coordinates of the rat's position while it forages
+    in an open arena (180cm x 180cm) for 92 minutes.
+
+    References
+    ----------
+    Mizuseki K, Sirota A, Pastalkova E, Buzsaki G. (2009): Multi-unit recordings
+    from the rat hippocampus made during open field foraging
+    http://dx.doi.org/10.6080/K0Z60KZ9
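+
+    Examples
+    --------
+    A minimal usage sketch, assuming a valid CRCNS account with credentials
+    set in the `CRCNS_USER` and `CRCNS_PASSWORD` environment variables:
+
+    >>> from neuroglia.datasets import fetch_rat_hippocampus_foraging
+    >>> dataset = fetch_rat_hippocampus_foraging()  # doctest: +SKIP
+    >>> spikes, location = dataset.spikes, dataset.location  # doctest: +SKIP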
+
+    """
+
+    data_home = get_neuroglia_data_home(data_home=data_home)
+    if not os.path.exists(data_home):
+        os.makedirs(data_home)
+
+    # check if local files exist; if so, load them, otherwise download raw
+    spikes_path = _pkl_filepath(data_home, 'crcns_hc2_spikes.pkl')
+    location_path = _pkl_filepath(data_home, 'crcns_hc2_location.pkl')
+
+    if (not os.path.exists(spikes_path)) or (not os.path.exists(location_path)):
+        if not download_if_missing:
+            raise IOError("Data not found and `download_if_missing` is False")
+
+        tar_path = os.path.join(data_home, 'crcns_hc2.tar.gz')
+
+        crcns = CRCNSFileMetadata(
+            path="hc-2/ec014.333",
+            filename="ec014.333.tar.gz",
+            checksum='819d9060bcdd439a2024ee44cfb3e7be45056632af052e524e0e23f139c6a260',
+        )
+
+        local_filename = _fetch_crcns_datafile(
+            crcns=crcns,
+            local_filename=tar_path,
+            username=username,
+            password=password,
+        )
+
+        spikes, location = load_hc2(local_filename)
+
+        spikes.to_pickle(spikes_path)
+        location.to_pickle(location_path)
+
+        os.remove(tar_path)
+    else:
+        spikes = pd.read_pickle(spikes_path)
+        location = pd.read_pickle(location_path)
+
+    return Bunch(
+        spikes=spikes,
+        location=location,
+    )
diff --git a/neuroglia/datasets/figshare.py b/neuroglia/datasets/figshare.py
new file mode 100644
index 0000000..e01bc98
--- /dev/null
+++ b/neuroglia/datasets/figshare.py
@@ -0,0 +1,106 @@
+import os
+import numpy as np
+import pandas as pd
+from collections import namedtuple
+from scipy.io import loadmat
+from six.moves.urllib.request import urlretrieve
+from sklearn.datasets.base import _pkl_filepath
+from sklearn.utils import Bunch
+from .utils import _md5, get_neuroglia_data_home
+
+
+FigshareFileMetadata = namedtuple(
+    'FigshareFileMetadata',
+    ['url', 'filename', 'md5'],
+)
+
+
+def fetch_10k_neurons(data_home=None, download_if_missing=True):
+    """Loader for the '10k neurons' spontaneous activity recording hosted on figshare."""
+
+    data_home = get_neuroglia_data_home(data_home=data_home)
+    if not os.path.exists(data_home):
+        os.makedirs(data_home)
+
+    # check if local files exist; if so, load them, otherwise download raw
+    calcium_path = _pkl_filepath(data_home, 'tenk_neurons.pkl')
+    behavior_path = _pkl_filepath(data_home, 'tenk_behavior.pkl')
+
+    if (not os.path.exists(calcium_path)) or (not os.path.exists(behavior_path)):
+        if not download_if_missing:
+            raise IOError("Data not found and `download_if_missing` is False")
+
+        mat_path = os.path.join(data_home, 'figshare_10k.mat')
+
+        figshare = FigshareFileMetadata(
+            url="https://ndownloader.figshare.com/files/11152646",
+            filename="spont_M161025_MP030_2016-11-20.mat",
+            md5="54ffdc817e82710952dbee2655d4d1d2",
+        )
+
+        _fetch_figshare_datafile(
+            figshare=figshare,
+            local_filename=mat_path,
+        )
+
+        calcium, behavior = load_10k(mat_path)
+
+        calcium.to_pickle(calcium_path)
+        behavior.to_pickle(behavior_path)
+
+        os.remove(mat_path)
+    else:
+        calcium = pd.read_pickle(calcium_path)
+        behavior = pd.read_pickle(behavior_path)
+
+    return Bunch(
+        calcium=calcium,
+        behavior=behavior,
+    )
+
+
+def load_10k(mat_path):
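+
+    # layout assumed from the code below: 'Fsp' holds deconvolved activity
+    # as a (neurons x timepoints) array sampled every 0.25 s, and 'beh' is
+    # a MATLAB struct with running speed and pupil area on the same clock;
+    # loadmat returns structs as nested object arrays, hence the [0][0]
+    # indexing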
+    # read in the data and return neuroglia-friendly dataframes
+    data = loadmat(mat_path)
+
+    tres = 0.25
+    time = np.arange(0, data['Fsp'].shape[1] * tres, tres)
+
+    calcium_events = pd.DataFrame(
+        data['Fsp'].T,
+        index=time,
+        columns=['neuron_{}'.format(ii) for ii in range(data['Fsp'].shape[0])],
+    )
+
+    behavior = pd.DataFrame(
+        {
+            'speed': np.squeeze(data['beh']['runSpeed'][0][0]),
+            'pupil_size': np.squeeze(data['beh']['pupil'][0][0]['area'][0][0]),
+        },
+        index=time,
+    )
+
+    return calcium_events, behavior
+
+
+def _fetch_figshare_datafile(figshare, local_filename=None):
+
+    if local_filename is None:
+        local_filename = figshare.filename
+
+    # if a file with the expected checksum is already cached, skip the download
+    if os.path.exists(local_filename):
+        checksum = _md5(local_filename)
+        if figshare.md5 == checksum:
+            return local_filename
+
+    urlretrieve(figshare.url, local_filename)
+
+    checksum = _md5(local_filename)
+
+    if figshare.md5 != checksum:
+        raise IOError("{} has an md5 checksum ({}) "
+                      "differing from expected ({}), "
+                      "file may be corrupted.".format(local_filename, checksum,
+                                                      figshare.md5))
+    return local_filename
diff --git a/neuroglia/datasets/utils.py b/neuroglia/datasets/utils.py
new file mode 100644
index 0000000..437f7f5
--- /dev/null
+++ b/neuroglia/datasets/utils.py
@@ -0,0 +1,15 @@
+import os
+import hashlib
+from sklearn.datasets.base import get_data_home
+
+
+def get_neuroglia_data_home(data_home=None):
+    return os.path.join(get_data_home(data_home=data_home), 'neuroglia')
+
+
+def _md5(filename):
+    # read as bytes so hashing works the same on python 2 and 3
+    with open(filename, 'rb') as f:
+        data = f.read()
+    return hashlib.md5(data).hexdigest()
diff --git a/requirements.txt b/requirements.txt
index eaff179..737fa13 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ scipy
 pandas
 xarray
 scikit-learn
+requests
diff --git a/tests/datasets/test_utils.py b/tests/datasets/test_utils.py
new file mode 100644
index 0000000..c090d35
--- /dev/null
+++ b/tests/datasets/test_utils.py
@@ -0,0 +1,12 @@
+import os
+
+from neuroglia.datasets.utils import get_neuroglia_data_home
+
+HOME = os.path.join(
+    os.path.expanduser("~"),
+    'data',
+)
+
+
+def test_get_neuroglia_data_home():
+    EXPECTED = os.path.join(HOME, 'neuroglia')
+    data_home = get_neuroglia_data_home(data_home=HOME)
+    assert data_home == EXPECTED
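+
+
+# a minimal sketch of a test for the `_md5` helper, assuming pytest's
+# built-in `tmpdir` fixture; hashes a small file and compares against
+# hashlib directly
+def test_md5(tmpdir):
+    import hashlib
+
+    from neuroglia.datasets.utils import _md5
+
+    p = tmpdir.join("data.bin")
+    p.write_binary(b"neuroglia")
+    assert _md5(str(p)) == hashlib.md5(b"neuroglia").hexdigest()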