Add CLI for coSMicQC (#34)

* Create qcdataframe.py
* linting
* add qcdataframe
* linting
* adding tests
* linting
* update name, tests
* add back compat for self type
* back compat for isinstance
* linting
* add cli for cosmicqc
* linting
* add tests and wrappers
* linting and test adjustment
* attempting wrapper
* patch python-fire; fix tests
* add docstring to top of test
* add csv.gz compatibility
* add export capabilities
* rename file to correct module name
* add export capabilities
* add output capabilities
* Apply suggestions from code review
  Co-authored-by: Gregory Way <gregory.way@gmail.com>
* update tests and docs
* fix tests
* update tests; add constructor path for scdataframe
* linting
* modify tests
* enable pd.series compatibility
* update for exports via cli
* fix docstring
* add return types for test util
* fix deps
* add to docs on exports
* add docs for context
* note about ignore rule
* remove todo
* minor comment about display
* retain code comment
* correct code comment
---------
WayScience · Jun 26, 2024 · 037fbce · 037fbce
1 parent 197bc8a
commit 037fbce
Show file tree

Hide file tree

Showing 9 changed files with 416 additions and 45 deletions.
diff --git a/example.csv b/example.csv
@@ -0,0 +1,11 @@
+,example_feature
+0,1
+1,2
+2,3
+3,4
+4,5
+5,6
+6,7
+7,8
+8,9
+9,10
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -19,10 +19,14 @@ scipy = [
 ]
 pyarrow = "^16.0.0"
 pyyaml = "^6.0.1"
+fire = "^0.6.0"
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^8.2.0"
 
+[tool.poetry.scripts]
+cosmicqc = "cosmicqc.cli:cli_analyze"
+
 [tool.isort]
 profile = "black"
 

diff --git a/src/cosmicqc/analyze.py b/src/cosmicqc/analyze.py
@@ -23,6 +23,7 @@ def identify_outliers(
     feature_thresholds: Union[Dict[str, float], str],
     feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
     include_threshold_scores: bool = False,
+    export_path: Optional[str] = None,
 ) -> Union[pd.Series, pd.DataFrame]:
     """
     This function uses z-scoring to format the data for detecting outlier
@@ -35,8 +36,6 @@ def identify_outliers(
         df: Union[SCDataFrame, pd.DataFrame, str]
             DataFrame or file string-based filepath of a
             Parquet, CSV, or TSV file with CytoTable output or similar data.
-        metadata_columns: List[str]
-            List of metadata columns that should be outputted with the outlier data.
         feature_thresholds: Dict[str, float]
             One of two options:
             A dictionary with the feature name(s) as the key(s) and their assigned
@@ -48,6 +47,13 @@ def identify_outliers(
         feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
             An optional feature thresholds file where thresholds may be
             defined within a file.
+        include_threshold_scores: bool
+            Whether to include the threshold scores in addition to whether
+            the threshold set passes per row.
+        export_path: Optional[str] = None
+            An optional path to export the data using SCDataFrame export
+            capabilities. If None, no export is performed.
+            Note: compatible exports are CSVs, TSVs, and Parquet.
 
     Returns:
         Union[pd.Series, pd.DataFrame]:
@@ -95,7 +101,7 @@ def identify_outliers(
             condition = outlier_df[zscore_columns[feature]] < threshold
         conditions.append(condition)
 
-    return (
+    result = (
         # create a boolean pd.series identifier for dataframe
         # based on all conditions for use within other functions.
         reduce(operator.and_, conditions)
@@ -111,12 +117,18 @@ def identify_outliers(
         )
     )
 
+    if export_path is not None:
+        SCDataFrame(data=result).export(file_path=export_path)
+
+    return result
+
 
 def find_outliers(
     df: Union[SCDataFrame, pd.DataFrame, str],
     metadata_columns: List[str],
     feature_thresholds: Union[Dict[str, float], str],
     feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
+    export_path: Optional[str] = None,
 ) -> pd.DataFrame:
     """
     This function uses identify_outliers to return a dataframe
@@ -139,6 +151,10 @@ def find_outliers(
         feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
             An optional feature thresholds file where thresholds may be
             defined within a file.
+        export_path: Optional[str] = None
+            An optional path to export the data using SCDataFrame export
+            capabilities. If None, no export is performed.
+            Note: compatible exports are CSVs, TSVs, and Parquet.
 
     Returns:
         pd.DataFrame:
@@ -174,15 +190,22 @@ def find_outliers(
     # Include metadata columns in the output DataFrame
     columns_to_include = list(feature_thresholds.keys()) + metadata_columns
 
+    result = outliers_df[columns_to_include]
+
+    # export the file if specified
+    if export_path is not None:
+        SCDataFrame(data=result).export(file_path=export_path)
+
     # Return outliers DataFrame with specified columns
-    return outliers_df[columns_to_include]
+    return result
 
 
 def label_outliers(
     df: Union[SCDataFrame, pd.DataFrame, str],
     feature_thresholds: Optional[Union[Dict[str, float], str]] = None,
     feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
     include_threshold_scores: bool = False,
+    export_path: Optional[str] = None,
 ) -> pd.DataFrame:
     """
     Use identify_outliers to label the original dataset for
@@ -206,6 +229,10 @@ def label_outliers(
             include_threshold_scores: bool = False
                 Whether to include the scores in addition to whether an outlier
                 was detected or not.
+            export_path: Optional[str] = None
+                An optional path to export the data using SCDataFrame export
+                capabilities. If None, no export is performed.
+                Note: compatible exports are CSVs, TSVs, and Parquet.
 
         Returns:
             pd.DataFrame:
@@ -224,7 +251,7 @@ def label_outliers(
             feature_thresholds_file=feature_thresholds_file,
             include_threshold_scores=include_threshold_scores,
         )
-        return pd.concat(
+        result = pd.concat(
             [
                 df,
                 (
@@ -265,7 +292,13 @@ def label_outliers(
             axis=1,
         )
         # return a dataframe with columns deduplicated by name
-        return labeled_df.loc[:, ~labeled_df.columns.duplicated()]
+        result = labeled_df.loc[:, ~labeled_df.columns.duplicated()]
+
+    # export the file if specified
+    if export_path is not None:
+        SCDataFrame(data=result).export(file_path=export_path)
+
+    return result
 
 
 def read_thresholds_set_from_file(