Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add YAML-based threshold input options for find_outliers #13

Merged
merged 2 commits into from
May 21, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 61 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ python = ">=3.9,<3.13"
pandas = "^2.2.2"
scipy = "^1.13.0"
pyarrow = "^16.0.0"
pyyaml = "^6.0.1"

[tool.poetry.group.dev.dependencies]
pytest = "^8.2.0"
Expand Down
69 changes: 64 additions & 5 deletions src/cosmicqc/analyze.py
d33bs marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,24 @@
"""

import operator
import pathlib
from functools import reduce
from typing import Dict, List
from typing import Dict, List, Optional, Union

import pandas as pd
import yaml
from scipy.stats import zscore as scipy_zscore

DEFAULT_QC_THRESHOLD_FILE = (
f"{pathlib.Path(__file__).parent!s}/data/qc_thresholds_default.yml"
)


def find_outliers(
df: pd.DataFrame, feature_thresholds: Dict[str, float], metadata_columns: List[str]
df: pd.DataFrame,
metadata_columns: List[str],
feature_thresholds: Union[Dict[str, float], str],
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
) -> pd.DataFrame:
"""
This function uses z-scoring to format the data for detecting outlier
Expand All @@ -23,18 +32,31 @@ def find_outliers(
Args:
df: pd.DataFrame
Data frame with converted output from CytoTable.
metadata_columns: List[str]
List of metadata columns that should be outputted with the outlier data.
feature_thresholds: Dict[str, float]
Dictionary with the feature name(s) as the key(s) and their assigned
One of two options:
A dictionary with the feature name(s) as the key(s) and their assigned
threshold for identifying outliers. Positive int for the threshold
will detect outliers "above" the mean, negative int will detect
outliers "below" the mean.
metadata_columns: List[str]
List of metadata columns that should be outputted with the outlier data.
Or a string which is a named key reference found within
the feature_thresholds_file yaml file.
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
An optional feature thresholds file where thresholds may be
defined within a file.

Returns:
pd.DataFrame:
Outlier data frame for the given conditions.
"""

if isinstance(feature_thresholds, str):
feature_thresholds = read_thresholds_set_from_file(
feature_thresholds=feature_thresholds,
feature_thresholds_file=feature_thresholds_file,
)

# Create z-score columns for each feature to reference during outlier detection
zscore_columns = {}
for feature in feature_thresholds:
Expand Down Expand Up @@ -71,3 +93,40 @@ def find_outliers(

# Return outliers DataFrame with specified columns
return outliers_df[columns_to_include]


def read_thresholds_set_from_file(
    feature_thresholds: str, feature_thresholds_file: str
) -> Dict[str, float]:
    """
    Reads a named set of feature thresholds from a YAML file.

    The file is expected to hold a top-level ``thresholds`` mapping whose
    keys are threshold set names and whose values are mappings of feature
    name to threshold value.

    Args:
        feature_thresholds (str):
            Name of the threshold set to look up under the file's
            top-level ``thresholds`` key.
        feature_thresholds_file (str):
            The path to the YAML file containing feature thresholds.

    Returns:
        dict: The value stored under the requested threshold set name.

    Raises:
        LookupError: If the file has no usable ``thresholds`` mapping or
            that mapping does not contain the specified feature_thresholds key.
    """

    # YAML documents are read as UTF-8 explicitly so that the result does
    # not depend on the platform's default locale encoding.
    with open(feature_thresholds_file, "r", encoding="utf-8") as file:
        loaded = yaml.safe_load(file)

    # An empty file loads as None and an empty `thresholds:` section loads
    # as None as well; treat both like a missing set instead of letting a
    # TypeError escape from the membership test below.
    threshold_sets = loaded.get("thresholds") if isinstance(loaded, dict) else None

    if not isinstance(threshold_sets, dict) or feature_thresholds not in threshold_sets:
        raise LookupError(
            (
                f"Unable to find threshold set by name {feature_thresholds}"
                f" within {feature_thresholds_file}"
            )
        )

    return threshold_sets[feature_thresholds]
16 changes: 16 additions & 0 deletions src/cosmicqc/data/qc_thresholds_default.yml
d33bs marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# defines threshold sets for running qc procedures as part of this project.
versions:
cellprofiler: ">=4.2.4"
thresholds:
# Set a negative threshold to identify both outlier small nuclei
# and low formfactor representing non-circular segmentations.
small_and_low_formfactor_nuclei:
Nuclei_AreaShape_Area: -1
Nuclei_AreaShape_FormFactor: -1
# find very elongated nuclei segmentations (above mean)
elongated_nuclei:
Nuclei_AreaShape_Eccentricity: 2
# find large nuclei segmentations (above mean) and low formfactor
large_nuclei:
Nuclei_AreaShape_Area: 2
Nuclei_AreaShape_FormFactor: -2
97 changes: 97 additions & 0 deletions tests/test_analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""

import pandas as pd
import pytest
from cosmicqc import analyze


Expand Down Expand Up @@ -183,3 +184,99 @@ def test_find_outliers_cfret(cytotable_CFReT_data_df: pd.DataFrame):
14811: "f01",
},
}


def test_read_thresholds_set_from_file():
    """
    Checks named threshold set lookups against the default thresholds file.
    """

    default_file = analyze.DEFAULT_QC_THRESHOLD_FILE

    # a name which is absent from the thresholds file must
    # surface as a LookupError.
    with pytest.raises(LookupError):
        analyze.read_thresholds_set_from_file(
            feature_thresholds="bad_lookup_value",
            feature_thresholds_file=default_file,
        )

    # every default threshold set name paired with the mapping
    # the lookup is expected to return for it.
    expected_by_name = {
        "small_and_low_formfactor_nuclei": {
            "Nuclei_AreaShape_Area": -1,
            "Nuclei_AreaShape_FormFactor": -1,
        },
        "elongated_nuclei": {"Nuclei_AreaShape_Eccentricity": 2},
        "large_nuclei": {
            "Nuclei_AreaShape_Area": 2,
            "Nuclei_AreaShape_FormFactor": -2,
        },
    }

    for set_name, expected_thresholds in expected_by_name.items():
        loaded_thresholds = analyze.read_thresholds_set_from_file(
            feature_thresholds=set_name,
            feature_thresholds_file=default_file,
        )
        assert loaded_thresholds == expected_thresholds


def test_find_outliers_dict_and_default_config_cfret(
    cytotable_CFReT_data_df: pd.DataFrame,
):
    """
    Verifies that find_outliers gives matching results whether thresholds
    are passed as a dictionary or as a named yaml threshold set, using
    CytoTable CFReT data.
    """

    # metadata columns to include in output data frame
    metadata_columns = [
        "Image_Metadata_Plate",
        "Image_Metadata_Well",
        "Image_Metadata_Site",
    ]

    # pairs of (explicit threshold dictionary, equivalent named yaml set)
    equivalent_threshold_inputs = [
        (
            {
                "Nuclei_AreaShape_Area": -1,
                "Nuclei_AreaShape_FormFactor": -1,
            },
            "small_and_low_formfactor_nuclei",
        ),
        (
            {
                "Nuclei_AreaShape_Eccentricity": 2,
            },
            "elongated_nuclei",
        ),
        (
            {
                "Nuclei_AreaShape_Area": 2,
                "Nuclei_AreaShape_FormFactor": -2,
            },
            "large_nuclei",
        ),
    ]

    for thresholds_as_dict, thresholds_set_name in equivalent_threshold_inputs:
        outliers_from_dict = analyze.find_outliers(
            df=cytotable_CFReT_data_df,
            feature_thresholds=thresholds_as_dict,
            metadata_columns=metadata_columns,
        )
        outliers_from_yaml = analyze.find_outliers(
            df=cytotable_CFReT_data_df,
            feature_thresholds=thresholds_set_name,
            metadata_columns=metadata_columns,
        )

        # test that the output is the same from dict vs yaml
        pd.testing.assert_frame_equal(outliers_from_dict, outliers_from_yaml)
Loading