Merge pull request #6 from int-brain-lab/merfish

merfish cell data download
int-brain-lab · Feb 8, 2024 · fd78ad0 · fd78ad0
2 parents 3bcec73 + 5051739
commit fd78ad0
Show file tree

Hide file tree

Showing 9 changed files with 513 additions and 10 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,9 +1,12 @@
-## [1.0.0]
+## [0.5.0]
+### Added
+- `iblatlas.genomics.merfish` module for working with the Allen cell types
+ atlas (MERFISH spatially registered cell data)
 
+## [0.4.0]
 ### Added
-- `iblatlas.genomics` module for working with genomics data from Allen contains
-  - the Allen gene expression atlas in the `iblatlas.genomics.agea` module
-  - the Allen cell types atlas in the `iblatlas.genomics.merfish` module
+- `iblatlas.genomics.agea` module for working with the Allen gene expression
+ atlas (AGEA)
 ### Modified
 - slices of the atlas are now always returned with consistent sizes regardless of the volume layout on disk
 - atlases now can have an extra dimension in the image volume, to allow for multiple layers

diff --git a/examples/atlas_genomics_load_merfish.ipynb b/examples/atlas_genomics_load_merfish.ipynb
diff --git a/iblatlas/__init__.py b/iblatlas/__init__.py
@@ -194,4 +194,4 @@
 .. [10] Allen Mouse Common Coordinate Framework Technical White Paper (October 2017 v3)
    http://help.brain-map.org/download/attachments/8323525/Mouse_Common_Coordinate_Framework.pdf
 """
-__version__ = '0.4.0'
+__version__ = '0.5.0'
diff --git a/iblatlas/genomics/__init__.py b/iblatlas/genomics/__init__.py
@@ -0,0 +1,79 @@
+"""A package for working with Allen genomics datasets: AGEA and MERFISH.
+
+AGEA
+----
+
+This package provides a way to load the Allen Gene Expression volumes.
+The 4345 volumes have been registered and formatted into a binary file.
+
+     agea/
+     ├── gene-expression.bin
+     ├── gene-expression.pqt
+     ├── image.npy
+     └── label.npy
+
+-   gene-expression.bin is a float-16 binary file containing the gene expression volumes.
+In C-order, the dimensions are (4345, 58, 41, 67), which correspond to (nexperiments, ml, dv, ap) at 200 um.
+-   gene-expression.pqt is a parquet file describing the 4345 genes corresponding to the
+gene expression volumes.
+-   image.npy: the Allen atlas diffusion imaging volume downsampled at the gene expression resolution
+-   label.npy: the Allen atlas region label volume downsampled at the gene expression resolution
+See the building scripts in ./genomics/gene_expression_scrapping/05-generate-atlas-templates.py
+
+[1] E. S. Lein et al., “Genome-wide atlas of gene expression in the adult mouse brain,”
+ Nature, vol. 445, no. 7124, Art. no. 7124, Jan. 2007, doi: 10.1038/nature05453.
+[2] L. Ng et al., “An anatomic gene expression atlas of the adult mouse brain,”
+ Nat Neurosci, vol. 12, no. 3, Art. no. 3, Mar. 2009, doi: 10.1038/nn.2281.
+
+
+MERFISH
+-------
+
+Spatially registered cell types from single cell transcriptomics data.
+
+This package provides a way to load the MERFISH data from the Allen Brain Cell Atlas.
+We formatted the original CSV files from the 2023-12-15 release into parquet files for faster loading and smaller hard
+drive footprint.
+
+    merfish/
+    ├── genes.pqt
+    ├── neurotransmitters.pqt
+    ├── classes.pqt
+    ├── subclasses.pqt
+    ├── supertypes.pqt
+    ├── clusters.pqt
+    ├── C57BL6J-638850_cells.pqt
+    ├── Zhuang-ABCA-1_cells.pqt
+    ├── Zhuang-ABCA-2_cells.pqt
+    ├── Zhuang-ABCA-3_cells.pqt
+    └── Zhuang-ABCA-4_cells.pqt
+
+-   *_cells.pqt: Each dataframe corresponds to a given subject. The concatenation of those 5 dataframes leads to
+a dataframe of shape (8_879_868, 11), one record per cell, with the following columns:
+    -   'brain_section_label': the label of the brain section (subject and section): "Zhuang-ABCA-1.085"
+    -   'donor_label': the label of the subject
+    -   'neurotransmitter': neurotransmitter label {<NA>, 'Glut', 'Chol', 'GABA-Glyc', 'GABA', 'Dopa',
+        'Glut-GABA', 'Hist', 'Sero', 'Nora'}
+    -   'class': direct index of the class record in df_classes
+    -   'subclass': direct index of the subclass record in df_subclasses
+    -   'supertype': direct index of the supertype record in df_supertypes
+    -   'cluster': direct index of the cluster record in df_clusters
+    -   'x', 'y', 'z': coordinates of the cell in IBL space (see: iblatlas.atlas.AllenAtlas)
+    -   'Allen_id': Allen region unique identifier
+
+The cells are classified hierarchically, from high level to low level: classes, subclasses, supertypes and clusters.
+-   df_classes: a dataframe of classes (35, 3), where each record corresponds to a single class
+-   df_subclasses: a dataframe of subclasses (339, 4), where each record corresponds to a single subclass
+-   df_supertypes: a dataframe of supertypes (1202, 4), where each record corresponds to a single supertype
+-   df_clusters: a dataframe of clusters (5323, 5), where each record corresponds to a single cluster
+
+Additional metadata:
+-   df_neurotransmitters: a dataframe of neurotransmitters (9, 2), index is the neurotransmitter label
+-   df_genes: a dataframe of genes (1122), this could be used in conjunction with raw gene expressions data (not implemented)
+
+[1] Z. Yao et al., “A high-resolution transcriptomic and spatial atlas of cell types in the whole mouse brain,”
+ Nature, vol. 624, no. 7991, Art. no. 7991, Dec. 2023, doi: 10.1038/s41586-023-06812-z.
+[2] M. Zhang et al., “Molecularly defined and spatially resolved cell atlas of the whole mouse brain,”
+ Nature, vol. 624, no. 7991, Art. no. 7991, Dec. 2023, doi: 10.1038/s41586-023-06808-9.
+
+"""
diff --git a/iblatlas/genomics/agea.py b/iblatlas/genomics/agea.py
@@ -1,8 +1,4 @@
-"""
-[1] E. S. Lein et al., “Genome-wide atlas of gene expression in the adult mouse brain,”
- Nature, vol. 445, no. 7124, Art. no. 7124, Jan. 2007, doi: 10.1038/nature05453.
-[2] L. Ng et al., “An anatomic gene expression atlas of the adult mouse brain,”
- Nat Neurosci, vol. 12, no. 3, Art. no. 3, Mar. 2009, doi: 10.1038/nn.2281.
+"""A package for loading 4345 formatted and registered gene expression volumes
 """
 import logging
 from pathlib import Path

diff --git a/iblatlas/genomics/merfish.py b/iblatlas/genomics/merfish.py
@@ -0,0 +1,63 @@
+import logging
+from pathlib import Path
+
+import pandas as pd
+import numpy as np
+
+import one.remote.aws as aws
+
+from iblatlas import atlas
+
+_logger = logging.getLogger(__name__)
+
+
+def load(folder_cache=None):
+    """
+    Reads in the MERFISH cell tables and the associated cell types metadata tables
+
+    The parquet files are downloaded from the `atlas/merfish` folder of the IBL s3 bucket when the
+    cache folder contains no `*.version` flag file, or when the flag corresponds to an outdated version.
+
+    :param folder_cache: optional, folder containing (or receiving) the parquet files. Defaults to
+     the `merfish` sub-folder of the Allen atlas cache directory
+    :return:
+    df_cells: a dataframe of cells (8_879_868, 11), where each record corresponds to a single cell
+    df_classes: a dataframe of classes (35, 3), where each record corresponds to a single class
+    df_subclasses: a dataframe of subclasses (339, 4), where each record corresponds to a single subclass
+    df_supertypes: a dataframe of supertypes (1202, 4), where each record corresponds to a single supertype
+    df_clusters: a dataframe of clusters (5323, 5), where each record corresponds to a single cluster
+    df_genes: a dataframe of genes (1672, 4), where each record corresponds to a single gene
+    df_neurotransmitters: a dataframe of neurotransmitters (9, 2), where each record corresponds to a single
+     neurotransmitter
+    """
+    OLD_VERSIONS = ['2023-06-12']
+    folder_cache = Path(folder_cache or atlas.AllenAtlas._get_cache_dir().joinpath('merfish'))
+    # check the AWS version and download the files if needed
+    version_flag = next(folder_cache.glob('*.version'), None)
+    if version_flag is None or version_flag.stem in OLD_VERSIONS:
+        _logger.info(f'downloading MERFISH data from {aws.S3_BUCKET_IBL} s3 bucket...')
+        aws.s3_download_folder('atlas/merfish', folder_cache)
+    # rglob yields files in an arbitrary, filesystem-dependent order: sort the per-subject files
+    # so that the row order of the concatenated cells dataframe is reproducible
+    files_cells = sorted(folder_cache.rglob('*_cells.pqt'))
+    # it is faster and more memory efficient to read the parquet files with dask, but we do
+    # not want to require dask as a dependency so we provide the pandas alternative
+    try:
+        import dask.dataframe as dd
+        df_cells = dd.read_parquet(files_cells)
+        df_cells = df_cells.compute()
+    except ImportError:
+        df_cells = pd.concat([pd.read_parquet(f) for f in files_cells])
+    # reads in the other tables
+    df_classes = pd.read_parquet(folder_cache.joinpath('classes.pqt'))
+    df_subclasses = pd.read_parquet(folder_cache.joinpath('subclasses.pqt'))
+    df_supertypes = pd.read_parquet(folder_cache.joinpath('supertypes.pqt'))
+    df_clusters = pd.read_parquet(folder_cache.joinpath('clusters.pqt'))
+    df_genes = pd.read_parquet(folder_cache.joinpath('genes.pqt'))
+    df_neurotransmitters = pd.read_parquet(folder_cache.joinpath('neurotransmitters.pqt'))
+    return df_cells, df_classes, df_subclasses, df_supertypes, df_clusters, df_genes, df_neurotransmitters
+
+
+def int2rgb(array, dtype=None):
+    """
+    Convert rgba values stored as packed integers in dataframes into an (n, 4) array of bytes
+    :param array: rgba column of a dataframe or slice of the column. Any 1-D sequence of integers
+     is accepted: integers that are not 4 bytes wide (python lists, int64 columns) are cast to uint32
+    :param dtype: optional, if int, np.int8 or np.uint8 will return the uint8 view from 0-255,
+     else will return floats from 0-1
+    :return: numpy array of shape (n, 4): the 4 bytes of each input value, in memory order
+    """
+    packed = np.array(array)
+    if packed.dtype.kind in 'iu' and packed.dtype.itemsize != 4:
+        # the byte view below needs each value to span exactly 4 bytes, otherwise the reshape fails
+        packed = packed.astype(np.uint32)
+    rgba = packed.view('uint8').reshape(packed.shape[0], 4)
+    # the returned bytes are unsigned, so np.uint8 is accepted along with the historical int / np.int8
+    if dtype in (int, np.int8, np.uint8):
+        return rgba
+    return rgba.astype(float) / 255
diff --git a/iblatlas/genomics/merfish_scrapping/00_download_data.py b/iblatlas/genomics/merfish_scrapping/00_download_data.py
@@ -0,0 +1,44 @@
+"""
+Downloads the data from the Allen Brain Cell Atlas
+
+Some useful resources as of 2024-01-31:
+# https://alleninstitute.github.io/abc_atlas_access/notebooks/zhuang_merfish_tutorial.html
+# https://allen-brain-cell-atlas.s3.us-west-2.amazonaws.com/index.html
+
+# https://allen-brain-cell-atlas.s3.amazonaws.com/index.html
+# https://ibl-brain-wide-map-public.s3.amazonaws.com/index.html
+
+# the explorer setup is described by AWS here: https://github.com/awslabs/aws-js-s3-explorer/tree/master
+"""
+from pathlib import Path
+import os
+import json
+import requests
+from one.remote import aws
+
+# NOTE(review): the release used to be assigned twice in a row ('20231215' then '20230830') and
+# only the last value was ever used. The effective value is kept here - confirm the intended release
+version = '20230830'
+download_base = '/datadisk/Data/merfish_atlas/cache'
+
+# when True, the manifest is read from a previous download instead of being fetched from s3
+use_local_cache = False
+manifest_path = 'releases/%s/manifest.json' % version
+
+if not use_local_cache:
+    url = 'https://allen-brain-cell-atlas.s3-us-west-2.amazonaws.com/' + manifest_path
+    response = requests.get(url)
+    # fail on the HTTP error itself rather than on an obscure JSON decoding error of the error page
+    response.raise_for_status()
+    manifest = json.loads(response.text)
+else:
+    file = os.path.join(download_base, manifest_path)
+    with open(file, 'rb') as f:
+        manifest = json.load(f)
+
+# downloads only the `metadata` directory of each entry listed in the manifest
+s3_allen, bucket_name = aws.get_s3_allen()
+for r_dict in manifest['directory_listing'].values():
+    d_dict = r_dict['directories'].get('metadata')
+    if d_dict is None:
+        continue
+    local_path = Path(download_base).joinpath(d_dict['relative_path'])
+    print(local_path)
+    # !aws s3 ls s3://allen-brain-cell-atlas/metadata/Zhuang-ABCA-1/20231215/
+    aws.s3_download_folder(d_dict['relative_path'], local_path, s3_allen, bucket_name)