Merge pull request #7 from SomaLogic/master
Canopy 2022Q2. Addition of annotations lifting capability and revision of eLoD calculation.
tjohnson-somalogic authored May 3, 2022
2 parents c16575a + bde0f25 commit 7e94bc1
Showing 10 changed files with 311 additions and 32 deletions.
4 changes: 2 additions & 2 deletions LICENSE
@@ -2,14 +2,14 @@

Canopy™

Copyright © 2020 SomaLogic, Inc.
Copyright © 2022 SomaLogic Operating Company, Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy of the Canopy software
and associated documentation files (the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions outlined below. Further, Canopy and SomaLogic are trademarks
owned by SomaLogic, Inc. No license is hereby granted to these trademarks other than for purposes of
owned by SomaLogic Operating Company, Inc. No license is hereby granted to these trademarks other than for purposes of
identifying the origin or source of the Software.

The above copyright notice and this permission notice shall be included in all copies or substantial
2 changes: 1 addition & 1 deletion README.md
@@ -10,7 +10,7 @@

## Overview

This document accompanies the Python package `canopy`, which loads the SomaLogic, Inc. proprietary data file called an `*.adat`. The package provides auxiliary functions for extracting relevant information from the ADAT object once in the Python environment. Basic familiarity with the Python environment is assumed, as is the ability to install contributed packages from the Python Package Installer (pip)
This document accompanies the Python package `canopy`, which loads the SomaLogic Operating Company, Inc. proprietary data file called an `*.adat`. The package provides auxiliary functions for extracting relevant information from the ADAT object once in the Python environment. Basic familiarity with the Python environment is assumed, as is the ability to install contributed packages from the Python Package Installer (pip).

-----

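
The README overview above is the intended entry point: read an `*.adat` into an Adat object (a pandas-DataFrame-like container) and work with its metadata. A minimal sketch, assuming only `canopy.read_adat` and the bundled test file used elsewhere in this commit; the header key accessed at the end is an assumption about that file:

    import canopy

    adat = canopy.read_adat('./tests/data/control_data.adat')  # Adat behaves like a pandas DataFrame
    print(adat.shape)                                          # samples x SOMAmer reagents
    print(adat.columns.get_level_values('SeqId')[:5])          # column metadata is a MultiIndex keyed by SeqId
    print(adat.header_metadata['!AssayVersion'])               # header metadata behaves like a dict
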
185 changes: 185 additions & 0 deletions canopy/annotations.py
@@ -0,0 +1,185 @@
from __future__ import annotations
import pandas as pd
from canopy import Adat
from . import errors
from copy import deepcopy
import re


LIFTING_COLUMN_REGEX = r'\w+ Scalar v\d+\.\d+ to v\d+\.\d+'


class Annotations(pd.DataFrame):
"""A Pandas `DataFrame` object with additional functionality to help with annotations specific needs.
Modeled after:
https://github.com/geopandas
On subclassing the pandas dataframe:
https://pandas.pydata.org/pandas-docs/stable/development/extending.html#subclassing-pandas-data-structures
"""

_metadata = [
'supported_lifting_matrices',
'supported_lifting_signal_space',
]

def __init__(self, *args, **kwargs) -> None:
super(Annotations, self).__init__(*args, **kwargs)
self._update_supported_lifting_options()

@property
def _constructor(self) -> type[Annotations]:
self._update_supported_lifting_options()
return Annotations

def __setitem__(self, key, val):
super().__setitem__(key, val)
if re.match(LIFTING_COLUMN_REGEX, key):
self._update_supported_lifting_options()

def __delitem__(self, key) -> None:
super().__delitem__(key)
if re.match(LIFTING_COLUMN_REGEX, key):
self._update_supported_lifting_options()

def _update_supported_lifting_options(self):
self.supported_lifting_matrices = set()
self.supported_lifting_signal_space = set()

for name in self.columns:
if re.match(LIFTING_COLUMN_REGEX, name):
supported_info = name.split(' ')
self.supported_lifting_matrices.add(supported_info[0])
self.supported_lifting_signal_space.add((supported_info[2], supported_info[4]))

def update_adat_column_meta(self, adat: Adat) -> Adat:
"""Utility to update a provided adat's column metadata to match the annotations object.
Attempts to update the following column metadata in the adat:
- SomaId
- Target
- TargetFullName
- UniProt
- Type
- Organism
- EntrezGeneSymbol
- EntrezGeneID
Parameters
----------
adat : Adat
Canopy Adat object
Returns
-------
updated_adat : Adat
Canopy Adat object with updated column metadata
Examples
--------
>>> updated_adat = annotations.update_adat_column_meta(adat)
"""

xlsx_to_adat_column_map = {
'SomaId': 'SomaId',
'Target Name': 'Target',
'Target Full Name': 'TargetFullName',
'UniProt ID': 'UniProt',
'Type': 'Type',
'Organism': 'Organism',
'Entrez Gene Name': 'EntrezGeneSymbol',
'Entrez Gene ID': 'EntrezGeneID',
}

seq_ids = self.index.get_level_values('SeqId')
mod_adat = adat.copy()
for xlsx_col, adat_col in xlsx_to_adat_column_map.items():
if adat_col not in adat.columns.names:
continue
values_dict = {seq_id: col_meta for seq_id, col_meta in zip(seq_ids, self[xlsx_col].values)}
mod_adat = mod_adat.replace_keyed_meta(
axis=1,
replaced_meta_name=adat_col,
key_meta_name='SeqId',
values_dict=values_dict
)
return mod_adat

def supported_lifting_space_str(self):
ret_str = ', '.join(f'from "{space[0]}" to "{space[1]}"' for space in self.supported_lifting_signal_space)
return ret_str

def lift_adat(self, adat: Adat, lift_to_version: str = None) -> Adat:
"""Utility to perform lifting on an adat.
Parameters
----------
adat : Adat
Canopy Adat object
lift_to_version : str, optional
Target assay version; if omitted, it is inferred when the annotations support exactly one lift from the adat's signal space
Returns
-------
lifted_adat : Adat
Canopy Adat object with scaled RFU
Examples
--------
>>> lifted_adat = annotations.lift_adat(adat=adat)
"""
self._update_supported_lifting_options()

# Perform checks to see if this bridging is appropriate for this adat
adat = adat.copy()
process_steps = adat.header_metadata['!ProcessSteps']
scalable_norm_steps = 'Raw RFU, Hyb Normalization, medNormInt (SampleId), plateScale, Calibration, anmlQC, qcCheck, anmlSMP'
if not process_steps.startswith(scalable_norm_steps):
raise errors.AnnotationsLiftingError(f'ANML normalized SOMAscan data is required for lifting. Provided norm steps: "{process_steps}"')

# Get matrix from adat header metadata
matrix = adat.header_metadata['StudyMatrix']
if matrix == 'EDTA Plasma': # Takes care of the EDTA Plasma --> Plasma conversion so we can look up the column in the annotations df
matrix = 'Plasma'
if matrix not in self.supported_lifting_matrices:
raise errors.AnnotationsLiftingError(f'Unsupported matrix: "{matrix}". Supported matrices: {", ".join(self.supported_lifting_matrices)}.')

# Get assay version from adat header metadata. Prefer SignalSpace (created by lifting apps) if it exists
if 'SignalSpace' in adat.header_metadata:
signal_space = adat.header_metadata['SignalSpace']
else:
signal_space = adat.header_metadata['!AssayVersion']
if signal_space.lower() == 'v4': # Takes care of the v4 and V4 --> v4.0 conversion so we can look up the column in the annotations df
signal_space = 'v4.0'

# Check to see if we can perform this lifting with the assay version(s) provided
if lift_to_version:
if (signal_space, lift_to_version) not in self.supported_lifting_signal_space:
raise errors.AnnotationsLiftingError(f'Unsupported lifting from "{signal_space}" to "{lift_to_version}". Supported lifting: {self.supported_lifting_space_str()}.')
else:
possible_lifts = [lift for lift in self.supported_lifting_signal_space if lift[0] == signal_space]
if not possible_lifts:
raise errors.AnnotationsLiftingError(f'Unsupported lifting from: "{signal_space}". Supported lifting: {self.supported_lifting_space_str()}.')
elif len(possible_lifts) > 1:
raise errors.AnnotationsLiftingError(f'Too many lifting options. Please provide a value for the argument "lift_to_version". Supported lifting: {self.supported_lifting_space_str()}.')
else:
lift_to_version = possible_lifts[0][1]

# Build column name & get scalars
an_lifting_column = f'{matrix} Scalar {signal_space} to {lift_to_version}'
scalars = self[an_lifting_column]

# Check if seq ids will broadcast between adat & annotations (symmetric difference)
sym_diff = set(scalars.index) ^ set(adat.columns.get_level_values('SeqId'))
if sym_diff:
raise errors.AnnotationsLiftingError('Unable to perform lifting due to analyte mismatch between adat & annotations. Has either file been modified?')

# Scale adat
scaled_adat = adat.multiply(scalars, axis='columns', level='SeqId')
scaled_adat.header_metadata = deepcopy(adat.header_metadata)
scaled_adat.header_metadata['!ProcessSteps'] += f', Annotation Lift ({signal_space} to {lift_to_version})'
scaled_adat.header_metadata['SignalSpace'] = lift_to_version

return scaled_adat
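
Since the new module above is the core of this release, a usage sketch may help. Everything below is illustrative only: the unity scalars stand in for a real SomaLogic annotations file, and the adat is assumed to be ANML-normalized v4.0 plasma data so the checks inside lift_adat pass. With a genuine annotations file, annotations.update_adat_column_meta(adat) would likewise refresh Target, UniProt, and Entrez column metadata.

    import canopy
    import pandas as pd
    from canopy.annotations import Annotations

    adat = canopy.read_adat('./tests/data/control_data.adat')
    seq_ids = adat.columns.get_level_values('SeqId')

    # Fabricated annotations frame: one lifting column, unity scalars, indexed by the
    # adat's own SeqIds so the analyte-mismatch check passes. A real annotations file
    # would also carry SomaId, Target Name, UniProt ID, and the other mapped columns.
    annotations = Annotations(
        pd.DataFrame(
            {'Plasma Scalar v4.0 to v4.1': 1.0},
            index=pd.Index(seq_ids, name='SeqId'),
        )
    )

    lifted_adat = annotations.lift_adat(adat)            # lift_to_version inferred: only one lift is defined
    print(lifted_adat.header_metadata['SignalSpace'])    # 'v4.1'
    print(lifted_adat.header_metadata['!ProcessSteps'])  # '..., Annotation Lift (v4.0 to v4.1)'
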
30 changes: 12 additions & 18 deletions canopy/base/adat_math_helpers.py
@@ -1,31 +1,25 @@
import numpy as np
import pandas as pd

def calcELOD(x: pd.Series) -> float:  # eLOD for one reagent: median + 3 * (1.4826 * MAD)
med = np.median(x)
absDiff = np.abs(x-med)
medDiff = np.median(absDiff)
eLOD = med+3*(1.4826*medDiff)
return eLOD


class AdatMathHelpers:
"""A collection of methods to help with performing common and standard computations on the adat.
"""

def e_lod(self, groupby=None):
"""Computes estimated limit of detection of the buffer samples by plate as defined by median(somamer_rfu) + 4.9 * mad(somamer_rfu).
Parameters
----------
groupby : List(str)
Returns
-------
e_lod_df : pd.DataFrame
Examples
--------
>>> e_lod_df = adat.e_lod()
>>> e_lod_df = adat.e_lod(groupby='SampleId')
"""
groupby = groupby or 'PlateId'
e_lod_df = self.groupby(groupby).apply(lambda x: x.median() + 4.9 * x.mad())
def e_lod_by_reagent(self):
"""eLOD per SOMAmer reagent, computed from Buffer samples as median(rfu) + 3 * (1.4826 * mad(rfu))."""
df = self.pick_on_meta(axis=0, name='SampleType', values=['Buffer'])
df.columns = df.columns.get_level_values('SeqId')
e_lod_df = df.apply(calcELOD)
return e_lod_df


@staticmethod
def _compute_intra_cv(adat, groupby):
sums = {}
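
For reference, the revised eLOD above replaces the old per-plate definition median(rfu) + 4.9 * mad(rfu) with a per-reagent median(rfu) + 3 * (1.4826 * mad(rfu)) computed over the Buffer samples only, presumably exposed on an Adat as adat.e_lod_by_reagent(). A toy check of calcELOD with fabricated RFU values:

    import numpy as np
    import pandas as pd

    from canopy.base.adat_math_helpers import calcELOD

    buffer_rfu = pd.Series([48.0, 50.0, 52.0, 55.0, 61.0])  # fabricated buffer RFUs for one reagent
    med = np.median(buffer_rfu)                              # 52.0
    mad = np.median(np.abs(buffer_rfu - med))                # median absolute deviation = 3.0
    print(med + 3 * (1.4826 * mad))                          # 65.3434
    print(calcELOD(buffer_rfu))                              # same value
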
4 changes: 2 additions & 2 deletions docs/source/conf.py
@@ -19,8 +19,8 @@
# -- Project information -----------------------------------------------------

project = 'canopy'
copyright = '2020, SomaLogic'
author = 'SomaLogic, Inc.'
copyright = '2022, SomaLogic Operating Company, Inc'
author = 'SomaLogic Operating Company, Inc.'

# The full version, including alpha/beta/rc tags
release = pkg_resources.require('canopy')[0].version
2 changes: 1 addition & 1 deletion setup.py
@@ -6,7 +6,7 @@
setup(
name='canopy',
python_requires='>3.8.0',
version='0.3',
version='0.4',
packages=find_packages(),
url='http://confluence.sladmin.com/display/INGEN/Canopy',
license='',
8 changes: 3 additions & 5 deletions tests/data/control_data.adat

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion tests/test_adat_reading.py
@@ -55,7 +55,7 @@ def test_column_metadata_names(self):
])

def test_header_metadata_size(self):
self.assertEqual(len(self.adat.header_metadata.keys()), 40)
self.assertEqual(len(self.adat.header_metadata.keys()), 38)

def test_header_metadata_spot_check(self):
self.assertEqual(self.adat.header_metadata['HybNormReference'], 'intraplate')
8 changes: 6 additions & 2 deletions tests/test_adat_writing.py
@@ -7,7 +7,7 @@

def require_side_effect(*args, **kwargs):
class Version():
version = '0.0.1'
version = '0.2.1'
return [Version()]


@@ -16,8 +16,8 @@ class AdatWritingTest(TestCase):
"""

filename = './tests/data/control_data_written.adat'
source_filename = './tests/data/control_data.adat'

def setUp(self):
self.source_md5 = hashlib.md5()
with open(self.source_filename, 'rb') as f:
self.source_md5.update(f.read())
self.adat = canopy.read_adat('./tests/data/control_data.adat')

def tearDown(self):
@@ -34,7 +38,7 @@ def test_adat_md5(self):
hash_md5 = hashlib.md5()
with open(self.filename, 'rb') as f:
hash_md5.update(f.read())
self.assertEqual(hash_md5.hexdigest(), '58d675e7e43cc4e142ff94ac6efdfd94')
self.assertEqual(hash_md5.hexdigest(), 'e288d94404e1e30a2138b188daa9a7e9')


def require_side_effect_0_2(*args, **kwargs):