Update SCDataFrame name to CytoDataFrame (#50)

* update scdataframe name to cytodataframe
* remove persisted version from init
WayScience · Jul 15, 2024 · 6b86ccc · 6b86ccc
1 parent 1bfa792
commit 6b86ccc
Show file tree

Hide file tree

Showing 7 changed files with 109 additions and 113 deletions.
diff --git a/docs/examples/cosmicqc_in_a_nutshell.ipynb b/docs/examples/cosmicqc_in_a_nutshell.ipynb
diff --git a/docs/examples/cosmicqc_in_a_nutshell.py b/docs/examples/cosmicqc_in_a_nutshell.py
@@ -31,8 +31,8 @@
 # set a context directory for images associated with the dataset
 image_context_dir = pathlib.Path(data_path).parent / "Plate_2_images"
 
-# create a cosmicqc SCDataFrame (single-cell DataFrame)
-scdf = cosmicqc.SCDataFrame(data=data_path, data_context_dir=image_context_dir)
+# create a cosmicqc CytoDataFrame (single-cell DataFrame)
+scdf = cosmicqc.CytoDataFrame(data=data_path, data_context_dir=image_context_dir)
 
 # display the dataframe
 scdf
@@ -72,7 +72,7 @@
 # for each threshold set in the new columns
 labeled_scdf.show_report()
 
-# show cropped images through SCDataFrame from the dataset to help analyze outliers
+# show cropped images through CytoDataFrame from the dataset to help analyze outliers
 labeled_scdf.sort_values(by="cqc.large_nuclei.is_outlier", ascending=False)[
     [
         "Metadata_ImageNumber",
@@ -84,7 +84,7 @@
     ]
 ]
 
-# One can convert from cosmicqc.SCDataFrame to pd.DataFrame's
+# One can convert from cosmicqc.CytoDataFrame to pd.DataFrame's
 # (when or if needed!)
 df = pd.DataFrame(scdf)
 print(type(df))

diff --git a/pyproject.toml b/pyproject.toml
@@ -105,11 +105,6 @@ vcs = "git"
 [tool.poetry-dynamic-versioning.substitution]
 files = ["src/cosmicqc/__init__.py"]
 
-# set persistent versions within the __init__.py file in cases
-# where we may not have or want access to full git history
-[tool.poetry-dynamic-versioning.files."src/cosmicqc/__init__.py"]
-persistent-substitution = true
-
 [build-system]
 requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.0.0,<2.0.0"]
 build-backend = "poetry_dynamic_versioning.backend"

diff --git a/src/cosmicqc/__init__.py b/src/cosmicqc/__init__.py
@@ -3,7 +3,7 @@
 """
 
 from .analyze import find_outliers
-from .frame import SCDataFrame
+from .frame import CytoDataFrame
 
 # note: version placeholder is updated during build
 # by poetry-dynamic-versioning.

diff --git a/src/cosmicqc/analyze.py b/src/cosmicqc/analyze.py
@@ -11,20 +11,20 @@
 import yaml
 from scipy.stats import zscore as scipy_zscore
 
-from .frame import SCDataFrame
+from .frame import CytoDataFrame
 
 DEFAULT_QC_THRESHOLD_FILE = (
     f"{pathlib.Path(__file__).parent!s}/data/qc_nuclei_thresholds_default.yml"
 )
 
 
 def identify_outliers(
-    df: Union[SCDataFrame, pd.DataFrame, str],
+    df: Union[CytoDataFrame, pd.DataFrame, str],
     feature_thresholds: Union[Dict[str, float], str],
     feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
     include_threshold_scores: bool = False,
     export_path: Optional[str] = None,
-) -> Union[pd.Series, SCDataFrame]:
+) -> Union[pd.Series, CytoDataFrame]:
     """
     This function uses z-scoring to format the data for detecting outlier
     nuclei or cells using specific CellProfiler features. Thresholds are
@@ -33,7 +33,7 @@ def identify_outliers(
     threshold of 0 as that would represent the whole dataset.
 
     Args:
-        df: Union[SCDataFrame, pd.DataFrame, str]
+        df: Union[CytoDataFrame, pd.DataFrame, str]
             DataFrame or file string-based filepath of a
             Parquet, CSV, or TSV file with CytoTable output or similar data.
         feature_thresholds: Dict[str, float]
@@ -51,18 +51,18 @@ def identify_outliers(
             Whether to include the threshold scores in addition to whether
             the threshold set passes per row.
         export_path: Optional[str] = None
-            An optional path to export the data using SCDataFrame export
+            An optional path to export the data using CytoDataFrame export
             capabilities. If None no export is performed.
             Note: compatible exports are CSV's, TSV's, and parquet.
 
     Returns:
-        Union[pd.Series, SCDataFrame]:
+        Union[pd.Series, CytoDataFrame]:
             Outlier series with booleans based on whether outliers were detected
             or not for use within other functions.
     """
 
-    # interpret the df as SCDataFrame
-    df = SCDataFrame(data=df)
+    # interpret the df as CytoDataFrame
+    df = CytoDataFrame(data=df)
 
     # create a copy of the dataframe to ensure
     # we don't modify the supplied dataframe inplace.
@@ -109,12 +109,12 @@ def identify_outliers(
         reduce(operator.and_, conditions)
         if not include_threshold_scores
         # otherwise, provide the threshold zscore col and the above column
-        else SCDataFrame(
+        else CytoDataFrame(
             data=pd.concat(
                 [
                     # grab only the outlier zscore columns from the outlier_df
                     outlier_df[zscore_columns.values()],
-                    SCDataFrame(
+                    CytoDataFrame(
                         {
                             f"{thresholds_name}.is_outlier": reduce(
                                 operator.and_, conditions
@@ -130,15 +130,15 @@ def identify_outliers(
 
     if export_path is not None:
         if isinstance(result, pd.Series):
-            SCDataFrame(result).export(file_path=export_path)
+            CytoDataFrame(result).export(file_path=export_path)
         else:
             result.export(file_path=export_path)
 
     return result
 
 
 def find_outliers(
-    df: Union[SCDataFrame, pd.DataFrame, str],
+    df: Union[CytoDataFrame, pd.DataFrame, str],
     metadata_columns: List[str],
     feature_thresholds: Union[Dict[str, float], str],
     feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
@@ -149,7 +149,7 @@ def find_outliers(
     with only the outliers and provided metadata columns.
 
     Args:
-        df: Union[SCDataFrame, pd.DataFrame, str]
+        df: Union[CytoDataFrame, pd.DataFrame, str]
             DataFrame or file string-based filepath of a
             Parquet, CSV, or TSV file with CytoTable output or similar data.
         metadata_columns: List[str]
@@ -166,7 +166,7 @@ def find_outliers(
             An optional feature thresholds file where thresholds may be
             defined within a file.
         export_path: Optional[str] = None
-            An optional path to export the data using SCDataFrame export
+            An optional path to export the data using CytoDataFrame export
             capabilities. If None no export is performed.
             Note: compatible exports are CSV's, TSV's, and parquet.
 
@@ -175,8 +175,8 @@ def find_outliers(
             Outlier data frame for the given conditions.
     """
 
-    # interpret the df as SCDataFrame
-    df = SCDataFrame(data=df)
+    # interpret the df as CytoDataFrame
+    df = CytoDataFrame(data=df)
 
     if isinstance(feature_thresholds, str):
         feature_thresholds = read_thresholds_set_from_file(
@@ -215,20 +215,20 @@ def find_outliers(
 
 
 def label_outliers(  # noqa: PLR0913
-    df: Union[SCDataFrame, pd.DataFrame, str],
+    df: Union[CytoDataFrame, pd.DataFrame, str],
     feature_thresholds: Optional[Union[Dict[str, float], str]] = None,
     feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
     include_threshold_scores: bool = False,
     export_path: Optional[str] = None,
     report_path: Optional[str] = None,
     **kwargs: Dict[str, Any],
-) -> SCDataFrame:
+) -> CytoDataFrame:
     """
     Use identify_outliers to label the original dataset for
     where a cell passed or failed the quality control condition(s).
 
         Args:
-            df: Union[SCDataFrame, pd.DataFrame, str]
+            df: Union[CytoDataFrame, pd.DataFrame, str]
                 DataFrame or file string-based filepath of a
                 Parquet, CSV, or TSV file with CytoTable output or similar data.
             feature_thresholds: Dict[str, float]
@@ -246,17 +246,17 @@ def label_outliers(  # noqa: PLR0913
                 Whether to include the scores in addition to whether an outlier
                 was detected or not.
             export_path: Optional[str] = None
-                An optional path to export the data using SCDataFrame export
+                An optional path to export the data using CytoDataFrame export
                 capabilities. If None no export is performed.
                 Note: compatible exports are CSV's, TSV's, and parquet.
 
         Returns:
-            SCDataFrame:
+            CytoDataFrame:
                 Full dataframe with optional scores and outlier boolean column.
     """
 
-    # interpret the df as SCDataFrame
-    df = SCDataFrame(data=df)
+    # interpret the df as CytoDataFrame
+    df = CytoDataFrame(data=df)
 
     # for single outlier processing
     if isinstance(feature_thresholds, (str, dict)):
@@ -268,14 +268,14 @@ def label_outliers(  # noqa: PLR0913
             include_threshold_scores=include_threshold_scores,
         )
 
-        result = SCDataFrame(
+        result = CytoDataFrame(
             data=pd.concat(
                 [
                     df,
                     (
                         identified_outliers
                         if isinstance(identified_outliers, pd.DataFrame)
-                        else SCDataFrame(
+                        else CytoDataFrame(
                             {
                                 (
                                     f"cqc.{feature_thresholds}.is_outlier"
@@ -312,7 +312,7 @@ def label_outliers(  # noqa: PLR0913
             axis=1,
         )
         # return a dataframe with a deduplicated columns by name
-        result = SCDataFrame(
+        result = CytoDataFrame(
             labeled_df.loc[:, ~labeled_df.columns.duplicated()],
             data_context_dir=df._custom_attrs["data_context_dir"],
         )