feat: zero numeric validation function #249

Merged · 1 commit · Sep 5, 2024
2 changes: 2 additions & 0 deletions dcs_core/core/common/models/validation.py
@@ -80,6 +80,8 @@ class ValidationFunction(str, Enum):
PERCENTILE_60 = "percentile_60"
PERCENTILE_80 = "percentile_80"
PERCENTILE_90 = "percentile_90"
COUNT_ZERO = "count_zero"
PERCENT_ZERO = "percent_zero"

# Reliability validations 3
COUNT_ROWS = "count_rows"
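Because `ValidationFunction` mixes `str` into `Enum`, each member is itself a string and compares equal to its raw value, which is what lets the `count_zero` / `percent_zero` keywords in the YAML configs and the `.value`-keyed mapping in the validation manager line up. A small standalone illustration (only the two members added in this PR are reproduced):

```python
from enum import Enum


# Standalone sketch of the str-Enum pattern used by ValidationFunction;
# only the two members added in this PR are shown.
class ValidationFunction(str, Enum):
    COUNT_ZERO = "count_zero"
    PERCENT_ZERO = "percent_zero"


# A str-mixin Enum member compares equal to the plain string parsed from YAML.
assert ValidationFunction.COUNT_ZERO == "count_zero"
assert ValidationFunction.PERCENT_ZERO.value == "percent_zero"
```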
27 changes: 27 additions & 0 deletions dcs_core/core/datasource/sql_datasource.py
@@ -622,3 +622,30 @@ def query_get_percentile(
if filters:
query += f" WHERE {filters}"
return round(self.fetchone(query)[0], 2)

def query_zero_metric(
self, table: str, field: str, operation: str, filters: str = None
) -> Union[int, float]:
qualified_table_name = self.qualified_table_name(table)

zero_query = f"SELECT COUNT(*) FROM {qualified_table_name} WHERE {field} = 0"

if filters:
zero_query += f" AND {filters}"

if operation == "percent":
total_count_query = f"SELECT COUNT(*) FROM {qualified_table_name}"
if filters:
total_count_query += f" WHERE {filters}"

zero_count = self.fetchone(zero_query)[0]
total_count = self.fetchone(total_count_query)[0]

if total_count == 0:
return 0.0

result = (zero_count / total_count) * 100
return round(result, 2)
else:
result = self.fetchone(zero_query)[0]
return result
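For reference, the new method simply assembles plain `COUNT(*)` statements: one counting rows where the field equals zero, plus a second total-row count when `operation="percent"`, with an early `0.0` return guarding against division by zero on an empty table. A minimal standalone sketch of the string-building logic (the table, field, and filter values below are hypothetical, not taken from the PR):

```python
# Sketch only: mirrors the query-string construction in query_zero_metric above.
# Table, field, and filter values are hypothetical examples.
def build_zero_queries(qualified_table_name: str, field: str, filters: str = None):
    zero_query = f"SELECT COUNT(*) FROM {qualified_table_name} WHERE {field} = 0"
    if filters:
        zero_query += f" AND {filters}"

    total_count_query = f"SELECT COUNT(*) FROM {qualified_table_name}"
    if filters:
        total_count_query += f" WHERE {filters}"

    return zero_query, total_count_query


zero_q, total_q = build_zero_queries("product_db.products", "price", "discontinued = false")
print(zero_q)   # SELECT COUNT(*) FROM product_db.products WHERE price = 0 AND discontinued = false
print(total_q)  # SELECT COUNT(*) FROM product_db.products WHERE discontinued = false
```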
4 changes: 4 additions & 0 deletions dcs_core/core/validation/manager.py
@@ -32,13 +32,15 @@
)
from dcs_core.core.validation.numeric_validation import ( # noqa F401 this is used in globals
AvgValidation,
CountZeroValidation,
MaxValidation,
MinValidation,
Percentile20Validation,
Percentile40Validation,
Percentile60Validation,
Percentile80Validation,
Percentile90Validation,
PercentZeroValidation,
StdDevValidation,
SumValidation,
VarianceValidation,
@@ -157,6 +159,8 @@ class ValidationManager:
ValidationFunction.PERCENTILE_60.value: "Percentile60Validation",
ValidationFunction.PERCENTILE_80.value: "Percentile80Validation",
ValidationFunction.PERCENTILE_90.value: "Percentile90Validation",
ValidationFunction.COUNT_ZERO.value: "CountZeroValidation",
ValidationFunction.PERCENT_ZERO.value: "PercentZeroValidation",
}

def __init__(
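The mapping above stores validation class names as strings, and the `# noqa F401 this is used in globals` comment on the import block suggests the manager resolves those strings through the module namespace at runtime. A self-contained sketch of that string-to-class dispatch pattern, with stand-in classes rather than the project's real ones:

```python
# Sketch of the globals()-based dispatch hinted at by the noqa comment above.
# The classes and mapping here are stand-ins, not the project's actual code.
class CountZeroValidation:
    pass


class PercentZeroValidation:
    pass


VALIDATION_CLASS_MAPPING = {
    "count_zero": "CountZeroValidation",
    "percent_zero": "PercentZeroValidation",
}


def resolve_validation_class(validation_function: str) -> type:
    # Look up the class name string, then fetch the class object from the
    # importing module's namespace.
    class_name = VALIDATION_CLASS_MAPPING[validation_function]
    return globals()[class_name]


assert resolve_validation_class("count_zero") is CountZeroValidation
assert resolve_validation_class("percent_zero") is PercentZeroValidation
```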
26 changes: 26 additions & 0 deletions dcs_core/core/validation/numeric_validation.py
@@ -190,3 +190,29 @@ def _generate_metric_value(self, **kwargs) -> float:
)
else:
raise ValueError("Unsupported data source type for Percentile90Validation")


class CountZeroValidation(Validation):
def _generate_metric_value(self, **kwargs) -> int:
if isinstance(self.data_source, SQLDataSource):
return self.data_source.query_zero_metric(
table=self.dataset_name,
field=self.field_name,
operation="count",
filters=self.where_filter if self.where_filter is not None else None,
)
else:
raise ValueError("Unsupported data source type for CountZeroValidation")


class PercentZeroValidation(Validation):
def _generate_metric_value(self, **kwargs) -> float:
if isinstance(self.data_source, SQLDataSource):
return self.data_source.query_zero_metric(
table=self.dataset_name,
field=self.field_name,
operation="percent",
filters=self.where_filter if self.where_filter is not None else None,
)
else:
raise ValueError("Unsupported data source type for PercentZeroValidation")
32 changes: 31 additions & 1 deletion docs/validations/validity.md
@@ -445,4 +445,34 @@ The percent permid validation checks the percentage of valid permid in a dataset
validations for product_db.products:
- percent_permid_of_user:
on: percent_permid(perm_id)
```

## Zero Value Validations

Zero value validations identify and validate fields that contain zero values. They matter for datasets where a zero carries specific meaning, such as missing or invalid data, or a genuine real-world condition.

## `COUNT_ZERO`

`COUNT_ZERO` counts the number of rows where the specified field is zero. It is useful for detecting cases where a zero may represent missing data or a special condition.

**Example:**

```yaml
validations for product_db.products:
- price zero count:
on: count_zero(price)
threshold: "< 52"
```

## `PERCENT_ZERO`

`PERCENT_ZERO` calculates the percentage of rows where the specified field is zero. It measures the proportion of zero values in a column and lets you enforce percentage-based data-quality thresholds.

**Example:**

```yaml
validations for product_db.products:
- price zero percent:
on: percent_zero(price)
threshold: "< 10"
```
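To make the relationship between the two metrics concrete, here is the arithmetic the `percent` path performs, using the row counts from this PR's integration-test fixture (3 of the 6 seeded rows have `salary = 0`):

```python
# Worked example using the counts asserted in the integration tests below.
zero_count = 3        # rows where salary = 0 in the seeded test table
total_count = 6       # total seeded rows
percent_zero = round((zero_count / total_count) * 100, 2)
print(zero_count, percent_zero)  # 3 50.0 -- matches the test assertions
```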
49 changes: 29 additions & 20 deletions examples/configurations/postgres/example_postgres_config.yaml
@@ -18,6 +18,35 @@ validations for iris_pgsql.dcs_iris:
- sepal length stddev size:
on: stddev(sepal_length)
threshold: "< 0.5"
# Percentile Validations
- sepal length 20th percentile:
on: percentile_20(sepal_length)
threshold: "> 10"

- sepal length 40th percentile:
on: percentile_40(sepal_length)
threshold: "> 20"

- sepal length 60th percentile:
on: percentile_60(sepal_length)
threshold: "> 30"

- sepal length 80th percentile:
on: percentile_80(sepal_length)
threshold: "> 40"

- sepal length 90th percentile:
on: percentile_90(sepal_length)
threshold: "> 50"

# Zero Numeric Validations
- sepal length zero count:
on: count_zero(sepal_length)
threshold: "< 5"
- sepal length zero percent:
on: percent_zero(sepal_length)
threshold: "< 10"


# Uniqueness Metrics
- species duplicate count:
@@ -138,23 +167,3 @@ validations for product_db.products:
on: percent_longitude(longitude)
threshold: "> 80"

44 changes: 44 additions & 0 deletions tests/core/configuration/test_configuration_v1.py
@@ -949,3 +949,47 @@ def test_should_parse_90th_percentile_validation():
.get_validation_function
== ValidationFunction.PERCENTILE_90
)


def test_should_parse_count_zero_validation():
yaml_string = """
validations for product_db.products:
- test:
on: count_zero(price)
threshold: "< 52"
"""
configuration = load_configuration_from_yaml_str(yaml_string)
assert (
configuration.validations["product_db.products"]
.validations["test"]
.get_validation_function
== ValidationFunction.COUNT_ZERO
)
assert (
configuration.validations["product_db.products"]
.validations["test"]
.threshold.lt
== 52
)


def test_should_parse_percent_zero_validation():
yaml_string = """
validations for product_db.products:
- test:
on: percent_zero(price)
threshold: "< 10"
"""
configuration = load_configuration_from_yaml_str(yaml_string)
assert (
configuration.validations["product_db.products"]
.validations["test"]
.get_validation_function
== ValidationFunction.PERCENT_ZERO
)
assert (
configuration.validations["product_db.products"]
.validations["test"]
.threshold.lt
== 10
)
37 changes: 25 additions & 12 deletions tests/integration/datasource/test_sql_datasource.py
@@ -130,7 +130,8 @@ def setup_tables(
cusip VARCHAR(9),
figi VARCHAR(12),
isin VARCHAR(12),
perm_id VARCHAR(50),
salary INTEGER
)
"""
)
@@ -142,27 +143,27 @@
('thor', '{(utc_now - datetime.timedelta(days=10)).strftime("%Y-%m-%d")}',
1500, NULL, 'thor hammer', 'e7194aaa-5516-4362-a5ff-6ff971976bec',
'123-456-7890', 'jane.doe@domain', 'C2', 'ABCDE', 40.0678, -7555555554.0060,'856-45-6789','0067340',
'JRIK0092LOAUCXTR6042','03783310','BBG000B9XRY4','US0378331005', '1234--5678-9012--3456-789', 0), -- invalid email -- invalid usa_state_code -- invalid usa_zip_code -- invalid cusip -- invalid perm_id
('captain america', '{(utc_now - datetime.timedelta(days=3)).strftime("%Y-%m-%d")}',
90, 80, 'shield', 'e7194aaa-5516-4362-a5ff-6ff971976b', '(123) 456-7890',
'john.doe@.com ', 'NY', '12-345', 34.0522, -118.2437,'000-12-3456', 'B01HL06',
'CDR300OS7OJENVEDDW89','037833100','BBG000BL2H25','US5949181045', '1234567890123456789', 1000), -- invalid weapon_id --invalid email -- invalid usa_zip_code -- invalid ssn
('iron man', '{(utc_now - datetime.timedelta(days=4)).strftime("%Y-%m-%d")}',
50, 70, 'suit', '1739c676-6108-4dd2-8984-2459df744936', '123 456 7890',
'contact@company..org', 'XY', '85001', 37.7749, -122.4194,'859-99-9999','4155586',
'VXQ400F1OBWAVPBJP86','594918104','BBG000B3YB97','US38259P5088', '123456789012345678', 0), -- invalid email -- invalid usa_state_code -- invalid lei -- invalid perm_id
('hawk eye', '{(utc_now - datetime.timedelta(days=5)).strftime("%Y-%m-%d")}',
40, 60, 'bow', '1739c676-6108-4dd2-8984-2459df746', '+1 123-456-7890',
'user@@example.com', 'TX', '30301', 51.1657, 10.4515,'123-45-67890','12345',
'FKRD00GCEYWDCNYLNF60','38259P508','BBG000B57Y12','US83165F1026', '5647382910564738291', 90), -- invalid weapon_id --invalid email -- invalid ssn -- invalid sedol
('clark kent', '{(utc_now - datetime.timedelta(days=6)).strftime("%Y-%m-%d")}',
35, 50, '', '7be61b2c-45dc-4889-97e3-9202e8', '09123.456.7890',
'contact@company.org', 'ZZ', '123456', 51.5074, -0.1278,'666-45-6789','34A56B7',
'6R5J00FMIANQQH6JMN56','83165F102','BBG000B9XRY','US0231351067', '1234-5678-9012-3456-78X', 0), -- invalid weapon_id -- invalid phone -- invalid usa_state_code -- invalid usa_zip_code -- invalid ssn -- invalid sedol -- invalid figi -- invalid perm_id
('black widow', '{(utc_now - datetime.timedelta(days=6)).strftime("%Y-%m-%d")}',
35, 50, '', '7be61b2c-45dc-4889-97e3-9202e8032c73', '+1 (123) 456-7890',
'jane_smith123@domain.co.uk', 'FL', '90210', 483.8566, 2.3522,'001-01-0001','456VGHY',
'0FPB00BBRHUYOE7DSK19','023135106','BBG000B6R530','US037833100', '2345-6789-0123-4567-890', 70) -- invalid isin -- invalid sedol
"""

postgresql_connection.execute(text(insert_query))
@@ -578,36 +579,48 @@ def test_should_return_20th_percentile_age(
percentile_20 = postgres_datasource.query_get_percentile(
table=self.TABLE_NAME, field="age", percentile=0.2
)
assert percentile_20 == 35

def test_should_return_40th_percentile_age(
self, postgres_datasource: PostgresDataSource
):
percentile_40 = postgres_datasource.query_get_percentile(
table=self.TABLE_NAME, field="age", percentile=0.4
)
assert percentile_40 == 40

def test_should_return_60th_percentile_age(
self, postgres_datasource: PostgresDataSource
):
percentile_60 = postgres_datasource.query_get_percentile(
table=self.TABLE_NAME, field="age", percentile=0.6
)
assert percentile_60 == 50

def test_should_return_80th_percentile_age(
self, postgres_datasource: PostgresDataSource
):
percentile_80 = postgres_datasource.query_get_percentile(
table=self.TABLE_NAME, field="age", percentile=0.8
)
assert percentile_80 == 90

def test_should_return_90th_percentile_age(
self, postgres_datasource: PostgresDataSource
):
percentile_90 = postgres_datasource.query_get_percentile(
table=self.TABLE_NAME, field="age", percentile=0.9
)
assert percentile_90 == 1500

def test_should_return_count_zero(self, postgres_datasource: PostgresDataSource):
count_zero = postgres_datasource.query_zero_metric(
table=self.TABLE_NAME, field="salary", operation="count"
)
assert count_zero == 3

def test_should_return_percent_zero(self, postgres_datasource: PostgresDataSource):
percent_zero = postgres_datasource.query_zero_metric(
table=self.TABLE_NAME, field="salary", operation="percent"
)
assert round(percent_zero, 2) == 50.0