Skip to content

Commit

Permalink
Update SCDataFrame name to CytoDataFrame (#50)
Browse files Browse the repository at this point in the history
* update scdataframe name to cytodataframe

* remove persisted version from init
  • Loading branch information
d33bs authored Jul 15, 2024
1 parent 1bfa792 commit 6b86ccc
Show file tree
Hide file tree
Showing 7 changed files with 109 additions and 113 deletions.
48 changes: 24 additions & 24 deletions docs/examples/cosmicqc_in_a_nutshell.ipynb

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions docs/examples/cosmicqc_in_a_nutshell.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@
# set a context directory for images associated with the dataset
image_context_dir = pathlib.Path(data_path).parent / "Plate_2_images"

# create a cosmicqc SCDataFrame (single-cell DataFrame)
scdf = cosmicqc.SCDataFrame(data=data_path, data_context_dir=image_context_dir)
# create a cosmicqc CytoDataFrame (single-cell DataFrame)
scdf = cosmicqc.CytoDataFrame(data=data_path, data_context_dir=image_context_dir)

# display the dataframe
scdf
Expand Down Expand Up @@ -72,7 +72,7 @@
# for each threshold set in the new columns
labeled_scdf.show_report()

# show cropped images through SCDataFrame from the dataset to help analyze outliers
# show cropped images through CytoDataFrame from the dataset to help analyze outliers
labeled_scdf.sort_values(by="cqc.large_nuclei.is_outlier", ascending=False)[
[
"Metadata_ImageNumber",
Expand All @@ -84,7 +84,7 @@
]
]

# One can convert from cosmicqc.SCDataFrame to pd.DataFrame's
# One can convert from cosmicqc.CytoDataFrame to a pd.DataFrame
# (when or if needed!)
df = pd.DataFrame(scdf)
print(type(df))
Expand Down
5 changes: 0 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -105,11 +105,6 @@ vcs = "git"
[tool.poetry-dynamic-versioning.substitution]
files = ["src/cosmicqc/__init__.py"]

# set persistent versions within the __init__.py file in cases
# where we may not have or want access to full git history
[tool.poetry-dynamic-versioning.files."src/cosmicqc/__init__.py"]
persistent-substitution = true

[build-system]
requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.0.0,<2.0.0"]
build-backend = "poetry_dynamic_versioning.backend"
Expand Down
2 changes: 1 addition & 1 deletion src/cosmicqc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""

from .analyze import find_outliers
from .frame import SCDataFrame
from .frame import CytoDataFrame

# note: version placeholder is updated during build
# by poetry-dynamic-versioning.
Expand Down
52 changes: 26 additions & 26 deletions src/cosmicqc/analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,20 @@
import yaml
from scipy.stats import zscore as scipy_zscore

from .frame import SCDataFrame
from .frame import CytoDataFrame

DEFAULT_QC_THRESHOLD_FILE = (
f"{pathlib.Path(__file__).parent!s}/data/qc_nuclei_thresholds_default.yml"
)


def identify_outliers(
df: Union[SCDataFrame, pd.DataFrame, str],
df: Union[CytoDataFrame, pd.DataFrame, str],
feature_thresholds: Union[Dict[str, float], str],
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
include_threshold_scores: bool = False,
export_path: Optional[str] = None,
) -> Union[pd.Series, SCDataFrame]:
) -> Union[pd.Series, CytoDataFrame]:
"""
This function uses z-scoring to format the data for detecting outlier
nuclei or cells using specific CellProfiler features. Thresholds are
Expand All @@ -33,7 +33,7 @@ def identify_outliers(
threshold of 0 as that would represent the whole dataset.
Args:
df: Union[SCDataFrame, pd.DataFrame, str]
df: Union[CytoDataFrame, pd.DataFrame, str]
DataFrame or string-based filepath to a
Parquet, CSV, or TSV file with CytoTable output or similar data.
feature_thresholds: Dict[str, float]
Expand All @@ -51,18 +51,18 @@ def identify_outliers(
Whether to include the threshold scores in addition to whether
the threshold set passes per row.
export_path: Optional[str] = None
An optional path to export the data using SCDataFrame export
An optional path to export the data using CytoDataFrame export
capabilities. If None no export is performed.
Note: compatible exports are CSVs, TSVs, and parquet.
Returns:
Union[pd.Series, SCDataFrame]:
Union[pd.Series, CytoDataFrame]:
Outlier series with booleans based on whether outliers were detected
or not for use within other functions.
"""

# interpret the df as SCDataFrame
df = SCDataFrame(data=df)
# interpret the df as CytoDataFrame
df = CytoDataFrame(data=df)

# create a copy of the dataframe to ensure
# we don't modify the supplied dataframe inplace.
Expand Down Expand Up @@ -109,12 +109,12 @@ def identify_outliers(
reduce(operator.and_, conditions)
if not include_threshold_scores
# otherwise, provide the threshold zscore col and the above column
else SCDataFrame(
else CytoDataFrame(
data=pd.concat(
[
# grab only the outlier zscore columns from the outlier_df
outlier_df[zscore_columns.values()],
SCDataFrame(
CytoDataFrame(
{
f"{thresholds_name}.is_outlier": reduce(
operator.and_, conditions
Expand All @@ -130,15 +130,15 @@ def identify_outliers(

if export_path is not None:
if isinstance(result, pd.Series):
SCDataFrame(result).export(file_path=export_path)
CytoDataFrame(result).export(file_path=export_path)
else:
result.export(file_path=export_path)

return result


def find_outliers(
df: Union[SCDataFrame, pd.DataFrame, str],
df: Union[CytoDataFrame, pd.DataFrame, str],
metadata_columns: List[str],
feature_thresholds: Union[Dict[str, float], str],
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
Expand All @@ -149,7 +149,7 @@ def find_outliers(
with only the outliers and provided metadata columns.
Args:
df: Union[SCDataFrame, pd.DataFrame, str]
df: Union[CytoDataFrame, pd.DataFrame, str]
DataFrame or string-based filepath to a
Parquet, CSV, or TSV file with CytoTable output or similar data.
metadata_columns: List[str]
Expand All @@ -166,7 +166,7 @@ def find_outliers(
An optional feature thresholds file where thresholds may be
defined within a file.
export_path: Optional[str] = None
An optional path to export the data using SCDataFrame export
An optional path to export the data using CytoDataFrame export
capabilities. If None no export is performed.
Note: compatible exports are CSVs, TSVs, and parquet.
Expand All @@ -175,8 +175,8 @@ def find_outliers(
Outlier data frame for the given conditions.
"""

# interpret the df as SCDataFrame
df = SCDataFrame(data=df)
# interpret the df as CytoDataFrame
df = CytoDataFrame(data=df)

if isinstance(feature_thresholds, str):
feature_thresholds = read_thresholds_set_from_file(
Expand Down Expand Up @@ -215,20 +215,20 @@ def find_outliers(


def label_outliers( # noqa: PLR0913
df: Union[SCDataFrame, pd.DataFrame, str],
df: Union[CytoDataFrame, pd.DataFrame, str],
feature_thresholds: Optional[Union[Dict[str, float], str]] = None,
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
include_threshold_scores: bool = False,
export_path: Optional[str] = None,
report_path: Optional[str] = None,
**kwargs: Dict[str, Any],
) -> SCDataFrame:
) -> CytoDataFrame:
"""
Use identify_outliers to label the original dataset with
whether each cell passed or failed the quality control condition(s).
Args:
df: Union[SCDataFrame, pd.DataFrame, str]
df: Union[CytoDataFrame, pd.DataFrame, str]
DataFrame or string-based filepath to a
Parquet, CSV, or TSV file with CytoTable output or similar data.
feature_thresholds: Dict[str, float]
Expand All @@ -246,17 +246,17 @@ def label_outliers( # noqa: PLR0913
Whether to include the scores in addition to whether an outlier
was detected or not.
export_path: Optional[str] = None
An optional path to export the data using SCDataFrame export
An optional path to export the data using CytoDataFrame export
capabilities. If None no export is performed.
Note: compatible exports are CSVs, TSVs, and parquet.
Returns:
SCDataFrame:
CytoDataFrame:
Full dataframe with optional scores and outlier boolean column.
"""

# interpret the df as SCDataFrame
df = SCDataFrame(data=df)
# interpret the df as CytoDataFrame
df = CytoDataFrame(data=df)

# for single outlier processing
if isinstance(feature_thresholds, (str, dict)):
Expand All @@ -268,14 +268,14 @@ def label_outliers( # noqa: PLR0913
include_threshold_scores=include_threshold_scores,
)

result = SCDataFrame(
result = CytoDataFrame(
data=pd.concat(
[
df,
(
identified_outliers
if isinstance(identified_outliers, pd.DataFrame)
else SCDataFrame(
else CytoDataFrame(
{
(
f"cqc.{feature_thresholds}.is_outlier"
Expand Down Expand Up @@ -312,7 +312,7 @@ def label_outliers( # noqa: PLR0913
axis=1,
)
# return a dataframe with deduplicated columns by name
result = SCDataFrame(
result = CytoDataFrame(
labeled_df.loc[:, ~labeled_df.columns.duplicated()],
data_context_dir=df._custom_attrs["data_context_dir"],
)
Expand Down
Loading

0 comments on commit 6b86ccc

Please sign in to comment.