Merge pull request #7 from SomaLogic/master
Canopy 2022Q2. Addition of annotations lifting capability and revision of eLoD calculation.
tjohnson-somalogic authored May 3, 2022
2 parents c16575a + bde0f25 commit 7e94bc1
Showing 10 changed files with 311 additions and 32 deletions.
4 changes: 2 additions & 2 deletions LICENSE
@@ -2,14 +2,14 @@

Canopy™

Copyright © 2020 SomaLogic, Inc.
Copyright © 2022 SomaLogic Operating Company, Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy of the Canopy software
and associated documentation files (the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions outlined below. Further, Canopy and SomaLogic are trademarks
owned by SomaLogic, Inc. No license is hereby granted to these trademarks other than for purposes of
owned by SomaLogic Operating Company, Inc. No license is hereby granted to these trademarks other than for purposes of
identifying the origin or source of the Software.

The above copyright notice and this permission notice shall be included in all copies or substantial
2 changes: 1 addition & 1 deletion README.md
@@ -10,7 +10,7 @@

## Overview

This document accompanies the Python package `canopy`, which loads the SomaLogic, Inc. proprietary data file called an `*.adat`. The package provides auxiliary functions for extracting relevant information from the ADAT object once in the Python environment. Basic familiarity with the Python environment is assumed, as is the ability to install contributed packages from the Python Package Installer (pip)
This document accompanies the Python package `canopy`, which loads the SomaLogic Operating Company, Inc. proprietary data file called an `*.adat`. The package provides auxiliary functions for extracting relevant information from the ADAT object once in the Python environment. Basic familiarity with the Python environment is assumed, as is the ability to install contributed packages from the Python Package Installer (pip).

-----

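
The README overview above is the intended entry point: read an `*.adat` into an Adat object (a pandas-DataFrame-like container) and work with its metadata. A minimal sketch, assuming only `canopy.read_adat` and the bundled test file used elsewhere in this commit; the header key accessed at the end is an assumption about that file:

    import canopy

    adat = canopy.read_adat('./tests/data/control_data.adat')  # Adat behaves like a pandas DataFrame
    print(adat.shape)                                          # samples x SOMAmer reagents
    print(adat.columns.get_level_values('SeqId')[:5])          # column metadata is a MultiIndex keyed by SeqId
    print(adat.header_metadata['!AssayVersion'])               # header metadata behaves like a dict
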
185 changes: 185 additions & 0 deletions canopy/annotations.py
@@ -0,0 +1,185 @@
from __future__ import annotations
import pandas as pd
from canopy import Adat
from . import errors
from copy import deepcopy
import re


LIFTING_COLUMN_REGEX = r'\w+ Scalar v\d+\.\d+ to v\d+\.\d+'


class Annotations(pd.DataFrame):
"""A Pandas `DataFrame` object with additional functionality to help with annotations specific needs.
Modeled after:
https://github.com/geopandas
On subclassing the pandas dataframe:
https://pandas.pydata.org/pandas-docs/stable/development/extending.html#subclassing-pandas-data-structures
"""

_metadata = [
'supported_lifting_matrices',
'supported_lifting_signal_space',
]

def __init__(self, *args, **kwargs) -> None:
super(Annotations, self).__init__(*args, **kwargs)
self._update_supported_lifting_options()

@property
def _constructor(self) -> type[Annotations]:
self._update_supported_lifting_options()
return Annotations

def __setitem__(self, key, val):
super().__setitem__(key, val)
if re.match(LIFTING_COLUMN_REGEX, key):
self._update_supported_lifting_options()

def __delitem__(self, key) -> None:
super().__delitem__(key)
if re.match(LIFTING_COLUMN_REGEX, key):
self._update_supported_lifting_options()

def _update_supported_lifting_options(self):
self.supported_lifting_matrices = set()
self.supported_lifting_signal_space = set()

for name in self.columns:
if re.match(LIFTING_COLUMN_REGEX, name):
supported_info = name.split(' ')
self.supported_lifting_matrices.add(supported_info[0])
self.supported_lifting_signal_space.add((supported_info[2], supported_info[4]))

def update_adat_column_meta(self, adat: Adat) -> Adat:
"""Utility to update a provided adat's column metadata to match the annotations object.
Attempts to update the following column metadata in the adat:
- SomaId
- Target
- TargetFullName
- UniProt
- Type
- Organism
- EntrezGeneSymbol
- EntrezGeneID
Parameters
----------
adat : Adat
Canopy Adat object
Returns
-------
updated_adat : Adat
Canopy Adat object with updated column metadata
Examples
--------
>>> updated_adat = annotations.update_adat_column_meta(adat)
"""

xlsx_to_adat_column_map = {
'SomaId': 'SomaId',
'Target Name': 'Target',
'Target Full Name': 'TargetFullName',
'UniProt ID': 'UniProt',
'Type': 'Type',
'Organism': 'Organism',
'Entrez Gene Name': 'EntrezGeneSymbol',
'Entrez Gene ID': 'EntrezGeneID',
}

seq_ids = self.index.get_level_values('SeqId')
mod_adat = adat.copy()
for xlsx_col, adat_col in xlsx_to_adat_column_map.items():
if adat_col not in adat.columns.names:
continue
values_dict = {seq_id: col_meta for seq_id, col_meta in zip(seq_ids, self[xlsx_col].values)}
mod_adat = mod_adat.replace_keyed_meta(
axis=1,
replaced_meta_name=adat_col,
key_meta_name='SeqId',
values_dict=values_dict
)
return mod_adat

def supported_lifting_space_str(self):
ret_str = ', '.join(f'from "{space[0]}" to "{space[1]}"' for space in self.supported_lifting_signal_space)
return ret_str

def lift_adat(self, adat: Adat, lift_to_version: str = None) -> Adat:
"""Utility to perform lifting on an adat.
Parameters
----------
adat : Adat
Canopy Adat object
lift_to_version : str, optional
Target assay version; if omitted, it is inferred when the annotations support exactly one lift from the adat's signal space
Returns
-------
lifted_adat : Adat
Canopy Adat object with scaled RFU
Examples
--------
>>> lifted_adat = annotations.lift_adat(adat=adat)
"""
self._update_supported_lifting_options()

# Perform checks to see if this bridging is appropriate for this adat
adat = adat.copy()
process_steps = adat.header_metadata['!ProcessSteps']
scalable_norm_steps = 'Raw RFU, Hyb Normalization, medNormInt (SampleId), plateScale, Calibration, anmlQC, qcCheck, anmlSMP'
if not process_steps.startswith(scalable_norm_steps):
raise errors.AnnotationsLiftingError(f'ANML normalized SOMAscan data is required for lifting. Provided norm steps: "{process_steps}"')

# Get matrix from adat header metadata
matrix = adat.header_metadata['StudyMatrix']
if matrix == 'EDTA Plasma': # Takes care of the EDTA Plasma --> Plasma conversion so we can look up the column in the annotations df
matrix = 'Plasma'
if matrix not in self.supported_lifting_matrices:
raise errors.AnnotationsLiftingError(f'Unsupported matrix: "{matrix}". Supported matrices: {", ".join(self.supported_lifting_matrices)}.')

# Get assay version from adat header metadata. Prefer SignalSpace (created by lifting apps) if it exists
if 'SignalSpace' in adat.header_metadata:
signal_space = adat.header_metadata['SignalSpace']
else:
signal_space = adat.header_metadata['!AssayVersion']
if signal_space.lower() == 'v4': # Takes care of the v4 and V4 --> v4.0 conversion so we can look up the column in the annotations df
signal_space = 'v4.0'

# Check to see if we can perform this lifting with the assay version(s) provided
if lift_to_version:
if (signal_space, lift_to_version) not in self.supported_lifting_signal_space:
raise errors.AnnotationsLiftingError(f'Unsupported lifting from "{signal_space}" to "{lift_to_version}". Supported lifting: {self.supported_lifting_space_str()}.')
else:
possible_lifts = [lift for lift in self.supported_lifting_signal_space if lift[0] == signal_space]
if not possible_lifts:
raise errors.AnnotationsLiftingError(f'Unsupported lifting from: "{signal_space}". Supported lifting: {self.supported_lifting_space_str()}.')
elif len(possible_lifts) > 1:
raise errors.AnnotationsLiftingError(f'Too many lifting options. Please provide a value for the argument "lift_to_version". Supported lifting: {self.supported_lifting_space_str()}.')
else:
lift_to_version = possible_lifts[0][1]

# Build column name & get scalars
an_lifting_column = f'{matrix} Scalar {signal_space} to {lift_to_version}'
scalars = self[an_lifting_column]

# Check if seq ids will broadcast between adat & annotations (symmetric difference)
sym_diff = set(scalars.index) ^ set(adat.columns.get_level_values('SeqId'))
if sym_diff:
raise errors.AnnotationsLiftingError('Unable to perform lifting due to analyte mismatch between adat & annotations. Has either file been modified?')

# Scale adat
scaled_adat = adat.multiply(scalars, axis='columns', level='SeqId')
scaled_adat.header_metadata = deepcopy(adat.header_metadata)
scaled_adat.header_metadata['!ProcessSteps'] += f', Annotation Lift ({signal_space} to {lift_to_version})'
scaled_adat.header_metadata['SignalSpace'] = lift_to_version

return scaled_adat
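
Since the new module above is the core of this release, a usage sketch may help. Everything below is illustrative only: the unity scalars stand in for a real SomaLogic annotations file, and the adat is assumed to be ANML-normalized v4.0 plasma data so the checks inside lift_adat pass. With a genuine annotations file, annotations.update_adat_column_meta(adat) would likewise refresh Target, UniProt, and Entrez column metadata.

    import canopy
    import pandas as pd
    from canopy.annotations import Annotations

    adat = canopy.read_adat('./tests/data/control_data.adat')
    seq_ids = adat.columns.get_level_values('SeqId')

    # Fabricated annotations frame: one lifting column, unity scalars, indexed by the
    # adat's own SeqIds so the analyte-mismatch check passes. A real annotations file
    # would also carry SomaId, Target Name, UniProt ID, and the other mapped columns.
    annotations = Annotations(
        pd.DataFrame(
            {'Plasma Scalar v4.0 to v4.1': 1.0},
            index=pd.Index(seq_ids, name='SeqId'),
        )
    )

    lifted_adat = annotations.lift_adat(adat)            # lift_to_version inferred: only one lift is defined
    print(lifted_adat.header_metadata['SignalSpace'])    # 'v4.1'
    print(lifted_adat.header_metadata['!ProcessSteps'])  # '..., Annotation Lift (v4.0 to v4.1)'
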
30 changes: 12 additions & 18 deletions canopy/base/adat_math_helpers.py
@@ -1,31 +1,25 @@
import numpy as np
import pandas as pd

def calcELOD(x: pd.Series) -> float:  # eLOD for one reagent: median + 3 * (1.4826 * MAD)
med = np.median(x)
absDiff = np.abs(x-med)
medDiff = np.median(absDiff)
eLOD = med+3*(1.4826*medDiff)
return eLOD


class AdatMathHelpers:
"""A collection of methods to help with performing common and standard computations on the adat.
"""

def e_lod(self, groupby=None):
"""Computes estimated limit of detection of the buffer samples by plate as defined by median(somamer_rfu) + 4.9 * mad(somamer_rfu).
Parameters
----------
groupby : List(str)
Returns
-------
e_lod_df : pd.DataFrame
Examples
--------
>>> e_lod_df = adat.e_lod()
>>> e_lod_df = adat.e_lod(groupby='SampleId')
"""
groupby = groupby or 'PlateId'
e_lod_df = self.groupby(groupby).apply(lambda x: x.median() + 4.9 * x.mad())
def e_lod_by_reagent(self):
"""eLOD per SOMAmer reagent, computed from Buffer samples as median(rfu) + 3 * (1.4826 * mad(rfu))."""
df = self.pick_on_meta(axis=0, name='SampleType', values=['Buffer'])
df.columns = df.columns.get_level_values('SeqId')
e_lod_df = df.apply(calcELOD)
return e_lod_df


@staticmethod
def _compute_intra_cv(adat, groupby):
sums = {}
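
For reference, the revised eLOD above replaces the old per-plate definition median(rfu) + 4.9 * mad(rfu) with a per-reagent median(rfu) + 3 * (1.4826 * mad(rfu)) computed over the Buffer samples only, presumably exposed on an Adat as adat.e_lod_by_reagent(). A toy check of calcELOD with fabricated RFU values:

    import numpy as np
    import pandas as pd

    from canopy.base.adat_math_helpers import calcELOD

    buffer_rfu = pd.Series([48.0, 50.0, 52.0, 55.0, 61.0])  # fabricated buffer RFUs for one reagent
    med = np.median(buffer_rfu)                              # 52.0
    mad = np.median(np.abs(buffer_rfu - med))                # median absolute deviation = 3.0
    print(med + 3 * (1.4826 * mad))                          # 65.3434
    print(calcELOD(buffer_rfu))                              # same value
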
4 changes: 2 additions & 2 deletions docs/source/conf.py
@@ -19,8 +19,8 @@
# -- Project information -----------------------------------------------------

project = 'canopy'
copyright = '2020, SomaLogic'
author = 'SomaLogic, Inc.'
copyright = '2022, SomaLogic Operating Company, Inc'
author = 'SomaLogic Operating Company, Inc.'

# The full version, including alpha/beta/rc tags
release = pkg_resources.require('canopy')[0].version
2 changes: 1 addition & 1 deletion setup.py
@@ -6,7 +6,7 @@
setup(
name='canopy',
python_requires='>3.8.0',
version='0.3',
version='0.4',
packages=find_packages(),
url='http://confluence.sladmin.com/display/INGEN/Canopy',
license='',
8 changes: 3 additions & 5 deletions tests/data/control_data.adat

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion tests/test_adat_reading.py
@@ -55,7 +55,7 @@ def test_column_metadata_names(self):
])

def test_header_metadata_size(self):
self.assertEqual(len(self.adat.header_metadata.keys()), 40)
self.assertEqual(len(self.adat.header_metadata.keys()), 38)

def test_header_metadata_spot_check(self):
self.assertEqual(self.adat.header_metadata['HybNormReference'], 'intraplate')
8 changes: 6 additions & 2 deletions tests/test_adat_writing.py
@@ -7,7 +7,7 @@

def require_side_effect(*args, **kwargs):
class Version():
version = '0.0.1'
version = '0.2.1'
return [Version()]


@@ -16,8 +16,8 @@ class AdatWritingTest(TestCase):
"""

filename = './tests/data/control_data_written.adat'
source_filename = './tests/data/control_data.adat'

def setUp(self):
self.source_md5 = hashlib.md5()
with open(self.source_filename, 'rb') as f:
self.source_md5.update(f.read())
self.adat = canopy.read_adat('./tests/data/control_data.adat')

def tearDown(self):
@@ -34,7 +38,7 @@ def test_adat_md5(self):
hash_md5 = hashlib.md5()
with open(self.filename, 'rb') as f:
hash_md5.update(f.read())
self.assertEqual(hash_md5.hexdigest(), '58d675e7e43cc4e142ff94ac6efdfd94')
self.assertEqual(hash_md5.hexdigest(), 'e288d94404e1e30a2138b188daa9a7e9')


def require_side_effect_0_2(*args, **kwargs):