Skip to content

Commit

Permalink
Add CLI for coSMicQC (#34)
Browse files Browse the repository at this point in the history
* Create qcdataframe.py

* linting

* add qcdataframe

* linting

* adding tests

* linting

* update name, tests

* add back compat for self type

* back compat for isinstance

* linting

* add cli for cosmicqc

* linting

* add tests and wrappers

* linting and test adjustment

* attempting wrapper

* patch python-fire; fix tests

* add docstring to top of test

* add csv.gz compatibility

* add export capabilities

* rename file to correct module name

* add export capabilities

* add output capabilities

* Apply suggestions from code review

Co-authored-by: Gregory Way <gregory.way@gmail.com>

* update tests and docs

* fix tests

* update tests; add constructor path for scdataframe

* linting

* modify tests

* enable pd.series compatibility

* update for exports via cli

* fix docstring

* add return types for test util

* fix deps

* add to docs on exports

* add docs for context

* note about ignore rule

* remove todo

* minor comment about display

* retain code comment

* correct code comment

---------
  • Loading branch information
d33bs authored Jun 26, 2024
1 parent 197bc8a commit 037fbce
Show file tree
Hide file tree
Showing 9 changed files with 416 additions and 45 deletions.
11 changes: 11 additions & 0 deletions example.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
,example_feature
0,1
1,2
2,3
3,4
4,5
5,6
6,7
7,8
8,9
9,10
113 changes: 75 additions & 38 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,14 @@ scipy = [
]
pyarrow = "^16.0.0"
pyyaml = "^6.0.1"
fire = "^0.6.0"

[tool.poetry.group.dev.dependencies]
pytest = "^8.2.0"

[tool.poetry.scripts]
cosmicqc = "cosmicqc.cli:cli_analyze"

[tool.isort]
profile = "black"

Expand Down
45 changes: 39 additions & 6 deletions src/cosmicqc/analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def identify_outliers(
feature_thresholds: Union[Dict[str, float], str],
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
include_threshold_scores: bool = False,
export_path: Optional[str] = None,
) -> Union[pd.Series, pd.DataFrame]:
"""
This function uses z-scoring to format the data for detecting outlier
Expand All @@ -35,8 +36,6 @@ def identify_outliers(
df: Union[SCDataFrame, pd.DataFrame, str]
DataFrame or string-based filepath of a
Parquet, CSV, or TSV file with CytoTable output or similar data.
metadata_columns: List[str]
List of metadata columns that should be outputted with the outlier data.
feature_thresholds: Dict[str, float]
One of two options:
A dictionary with the feature name(s) as the key(s) and their assigned
Expand All @@ -48,6 +47,13 @@ def identify_outliers(
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
An optional feature thresholds file where thresholds may be
defined within a file.
include_threshold_scores: bool
Whether to include the threshold scores in addition to whether
the threshold set passes per row.
export_path: Optional[str] = None
An optional path to export the data using SCDataFrame export
capabilities. If None, no export is performed.
Note: compatible export formats are CSV, TSV, and Parquet.
Returns:
Union[pd.Series, pd.DataFrame]:
Expand Down Expand Up @@ -95,7 +101,7 @@ def identify_outliers(
condition = outlier_df[zscore_columns[feature]] < threshold
conditions.append(condition)

return (
result = (
# create a boolean pd.series identifier for dataframe
# based on all conditions for use within other functions.
reduce(operator.and_, conditions)
Expand All @@ -111,12 +117,18 @@ def identify_outliers(
)
)

if export_path is not None:
SCDataFrame(data=result).export(file_path=export_path)

return result


def find_outliers(
df: Union[SCDataFrame, pd.DataFrame, str],
metadata_columns: List[str],
feature_thresholds: Union[Dict[str, float], str],
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
export_path: Optional[str] = None,
) -> pd.DataFrame:
"""
This function uses identify_outliers to return a dataframe
Expand All @@ -139,6 +151,10 @@ def find_outliers(
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
An optional feature thresholds file where thresholds may be
defined within a file.
export_path: Optional[str] = None
An optional path to export the data using SCDataFrame export
capabilities. If None, no export is performed.
Note: compatible export formats are CSV, TSV, and Parquet.
Returns:
pd.DataFrame:
Expand Down Expand Up @@ -174,15 +190,22 @@ def find_outliers(
# Include metadata columns in the output DataFrame
columns_to_include = list(feature_thresholds.keys()) + metadata_columns

result = outliers_df[columns_to_include]

# export the file if specified
if export_path is not None:
SCDataFrame(data=result).export(file_path=export_path)

# Return outliers DataFrame with specified columns
return outliers_df[columns_to_include]
return result


def label_outliers(
df: Union[SCDataFrame, pd.DataFrame, str],
feature_thresholds: Optional[Union[Dict[str, float], str]] = None,
feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE,
include_threshold_scores: bool = False,
export_path: Optional[str] = None,
) -> pd.DataFrame:
"""
Use identify_outliers to label the original dataset for
Expand All @@ -206,6 +229,10 @@ def label_outliers(
include_threshold_scores: bool = False
Whether to include the scores in addition to whether an outlier
was detected or not.
export_path: Optional[str] = None
An optional path to export the data using SCDataFrame export
capabilities. If None, no export is performed.
Note: compatible export formats are CSV, TSV, and Parquet.
Returns:
pd.DataFrame:
Expand All @@ -224,7 +251,7 @@ def label_outliers(
feature_thresholds_file=feature_thresholds_file,
include_threshold_scores=include_threshold_scores,
)
return pd.concat(
result = pd.concat(
[
df,
(
Expand Down Expand Up @@ -265,7 +292,13 @@ def label_outliers(
axis=1,
)
# return a dataframe with columns deduplicated by name
return labeled_df.loc[:, ~labeled_df.columns.duplicated()]
result = labeled_df.loc[:, ~labeled_df.columns.duplicated()]

# export the file if specified
if export_path is not None:
SCDataFrame(data=result).export(file_path=export_path)

return result


def read_thresholds_set_from_file(
Expand Down
Loading

0 comments on commit 037fbce

Please sign in to comment.