Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: negative value numeric validation function #250

Merged
merged 1 commit into from
Sep 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions dcs_core/core/common/models/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ class ValidationFunction(str, Enum):
PERCENTILE_90 = "percentile_90"
COUNT_ZERO = "count_zero"
PERCENT_ZERO = "percent_zero"
COUNT_NEGATIVE = "count_negative"
PERCENT_NEGATIVE = "percent_negative"

# Reliability validations 3
COUNT_ROWS = "count_rows"
Expand Down
25 changes: 25 additions & 0 deletions dcs_core/core/datasource/sql_datasource.py
Original file line number Diff line number Diff line change
Expand Up @@ -649,3 +649,28 @@ def query_zero_metric(
else:
result = self.fetchone(zero_query)[0]
return result

def query_negative_metric(
    self, table: str, field: str, operation: str, filters: Union[str, None] = None
) -> Union[int, float]:
    """
    Count, or compute the percentage of, rows whose ``field`` is negative.

    :param table: table name; resolved through ``self.qualified_table_name``
    :param field: column compared against zero (``field < 0``)
    :param operation: ``"percent"`` returns a percentage rounded to two
        decimals; any other value returns the raw count
    :param filters: optional SQL boolean expression restricting the rows
        considered; applied to both the negative count and the total count
    :return: the number of negative rows, or the percentage of negative rows
        among the (filtered) rows -- ``0.0`` when there are no rows at all
    """
    # NOTE(review): table, field and filters are interpolated verbatim into
    # the SQL text, so they must come from trusted configuration.
    qualified_table_name = self.qualified_table_name(table)

    negative_query = (
        f"SELECT COUNT(*) FROM {qualified_table_name} WHERE {field} < 0"
    )
    if filters:
        # Parenthesise the filter: without it, a top-level OR inside
        # ``filters`` would bind looser than AND and bypass the "< 0" test.
        negative_query += f" AND ({filters})"

    if operation != "percent":
        return self.fetchone(negative_query)[0]

    total_count_query = f"SELECT COUNT(*) FROM {qualified_table_name}"
    if filters:
        total_count_query += f" WHERE {filters}"

    # NULLIF maps a zero denominator to NULL, so an empty table (or a filter
    # that matches nothing) yields NULL instead of a division-by-zero error.
    percent_query = (
        f"SELECT (CAST(({negative_query}) AS float) / "
        f"NULLIF(CAST(({total_count_query}) AS float), 0)) * 100"
    )

    result = self.fetchone(percent_query)[0]
    if result is None:
        # No rows to inspect means no negative rows either.
        return 0.0
    return round(result, 2)
4 changes: 4 additions & 0 deletions dcs_core/core/validation/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
)
from dcs_core.core.validation.numeric_validation import ( # noqa F401 this is used in globals
AvgValidation,
CountNegativeValidation,
CountZeroValidation,
MaxValidation,
MinValidation,
Expand All @@ -40,6 +41,7 @@
Percentile60Validation,
Percentile80Validation,
Percentile90Validation,
PercentNegativeValidation,
PercentZeroValidation,
StdDevValidation,
SumValidation,
Expand Down Expand Up @@ -161,6 +163,8 @@ class ValidationManager:
ValidationFunction.PERCENTILE_90.value: "Percentile90Validation",
ValidationFunction.COUNT_ZERO.value: "CountZeroValidation",
ValidationFunction.PERCENT_ZERO.value: "PercentZeroValidation",
ValidationFunction.COUNT_NEGATIVE.value: "CountNegativeValidation",
ValidationFunction.PERCENT_NEGATIVE.value: "PercentNegativeValidation",
}

def __init__(
Expand Down
28 changes: 28 additions & 0 deletions dcs_core/core/validation/numeric_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,3 +216,31 @@ def _generate_metric_value(self, **kwargs) -> float:
)
else:
raise ValueError("Unsupported data source type for PercentZeroValidation")


class CountNegativeValidation(Validation):
    """Validation whose metric is the number of negative values in a field."""

    def _generate_metric_value(self, **kwargs) -> int:
        """
        Return the count of negative values for ``self.field_name``.

        Delegates to ``query_negative_metric`` with ``operation="count"``,
        forwarding ``self.where_filter`` unchanged (``None`` when unset).

        :raises ValueError: if the data source is not a ``SQLDataSource``
        """
        source = self.data_source
        if not isinstance(source, SQLDataSource):
            raise ValueError("Unsupported data source type for CountNegativeValidation")

        return source.query_negative_metric(
            table=self.dataset_name,
            field=self.field_name,
            operation="count",
            filters=self.where_filter,
        )


class PercentNegativeValidation(Validation):
    """Validation whose metric is the percentage of negative values in a field."""

    def _generate_metric_value(self, **kwargs) -> float:
        """
        Return the percentage of negative values for ``self.field_name``.

        Delegates to ``query_negative_metric`` with ``operation="percent"``,
        forwarding ``self.where_filter`` unchanged (``None`` when unset).

        :raises ValueError: if the data source is not a ``SQLDataSource``
        """
        source = self.data_source
        if not isinstance(source, SQLDataSource):
            raise ValueError(
                "Unsupported data source type for PercentNegativeValidation"
            )

        return source.query_negative_metric(
            table=self.dataset_name,
            field=self.field_name,
            operation="percent",
            filters=self.where_filter,
        )
29 changes: 29 additions & 0 deletions docs/validations/validity.md
Original file line number Diff line number Diff line change
Expand Up @@ -475,4 +475,33 @@ validations for product_db.products:
- price zero percent:
on: percent_zero(price)
threshold: "< 10"

# **Numeric Negative Value Validations**

The Numeric Negative Value Validations detect negative values in numeric fields within a dataset and ensure that the count or percentage of such values stays within a specified threshold.

## **COUNT_NEGATIVE**

This validation counts the number of negative values present in a given numeric field.

**Example**

```yaml
validations for product_db.products:
- negative value count should be less than 2:
on: count_negative(price)
threshold: "< 2"
```

## **PERCENT_NEGATIVE**

This validation calculates the percentage of negative values in a numeric field, relative to the total number of records (or, when a filter is configured, the number of records matching that filter).

**Example**

```yaml
validations for product_db.products:
- negative value percentage should be less than 40%:
on: percent_negative(price)
threshold: "< 40"
```
9 changes: 9 additions & 0 deletions examples/configurations/postgres/example_postgres_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,15 @@ validations for iris_pgsql.dcs_iris:
threshold: "< 10"


# Negative Value Validations
- price negative value count:
on: count_negative(price)
threshold: "< 2"

- price negative value percentage:
on: percent_negative(price)
threshold: "< 40"

# Uniqueness Metrics
- species duplicate count:
on: count_duplicate(species)
Expand Down
32 changes: 32 additions & 0 deletions tests/core/configuration/test_configuration_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -993,3 +993,35 @@ def test_should_parse_percent_zero_validation():
.threshold.lt
== 10
)


def test_should_parse_count_negative_validation():
    """A ``count_negative(price)`` rule parses to ``ValidationFunction.COUNT_NEGATIVE``."""
    yaml_string = """
    validations for product_db.products:
      - count_negative for price should be less than 2:
          on: count_negative(price)
          threshold: "< 2"
    """
    configuration = load_configuration_from_yaml_str(yaml_string)

    # Walk the parsed configuration step by step: dataset -> named validation.
    dataset_validations = configuration.validations["product_db.products"]
    parsed_validation = dataset_validations.validations[
        "count_negative for price should be less than 2"
    ]
    assert (
        parsed_validation.get_validation_function == ValidationFunction.COUNT_NEGATIVE
    )


def test_should_parse_percent_negative_validation():
    """A ``percent_negative(price)`` rule parses to ``ValidationFunction.PERCENT_NEGATIVE``."""
    yaml_string = """
    validations for product_db.products:
      - percent_negative for price should be less than 40%:
          on: percent_negative(price)
          threshold: "< 40"
    """
    configuration = load_configuration_from_yaml_str(yaml_string)

    # Walk the parsed configuration step by step: dataset -> named validation.
    dataset_validations = configuration.validations["product_db.products"]
    parsed_validation = dataset_validations.validations[
        "percent_negative for price should be less than 40%"
    ]
    assert (
        parsed_validation.get_validation_function
        == ValidationFunction.PERCENT_NEGATIVE
    )
33 changes: 25 additions & 8 deletions tests/integration/datasource/test_sql_datasource.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,8 @@ def setup_tables(
figi VARCHAR(12),
isin VARCHAR(12),
perm_id VARCHAR(50),
salary INTEGER
salary INTEGER,
price FLOAT
)
"""
)
Expand All @@ -143,27 +144,27 @@ def setup_tables(
('thor', '{(utc_now - datetime.timedelta(days=10)).strftime("%Y-%m-%d")}',
1500, NULL, 'thor hammer', 'e7194aaa-5516-4362-a5ff-6ff971976bec',
'123-456-7890', 'jane.doe@domain', 'C2', 'ABCDE', 40.0678, -7555555554.0060,'856-45-6789','0067340',
'JRIK0092LOAUCXTR6042','03783310','BBG000B9XRY4','US0378331005', '1234--5678-9012--3456-789', 0), -- invalid email -- invalid usa_state_code -- invalid usa_zip_code -- invalid cusip -- invalid perm_id
'JRIK0092LOAUCXTR6042','03783310','BBG000B9XRY4','US0378331005', '1234--5678-9012--3456-789', 0, 100.0), -- invalid email -- invalid usa_state_code -- invalid usa_zip_code -- invalid cusip -- invalid perm_id
('captain america', '{(utc_now - datetime.timedelta(days=3)).strftime("%Y-%m-%d")}',
90, 80, 'shield', 'e7194aaa-5516-4362-a5ff-6ff971976b', '(123) 456-7890',
'john.doe@.com ', 'NY', '12-345', 34.0522, -118.2437,'000-12-3456', 'B01HL06',
'CDR300OS7OJENVEDDW89','037833100','BBG000BL2H25','US5949181045', '1234567890123456789', 1000), -- invalid weapon_id --invalid email -- invalid usa_zip_code -- invalid ssn
'CDR300OS7OJENVEDDW89','037833100','BBG000BL2H25','US5949181045', '1234567890123456789', 1000, -50.0), -- invalid weapon_id --invalid email -- invalid usa_zip_code -- invalid ssn
('iron man', '{(utc_now - datetime.timedelta(days=4)).strftime("%Y-%m-%d")}',
50, 70, 'suit', '1739c676-6108-4dd2-8984-2459df744936', '123 456 7890',
'contact@company..org', 'XY', '85001', 37.7749, -122.4194,'859-99-9999','4155586',
'VXQ400F1OBWAVPBJP86','594918104','BBG000B3YB97','US38259P5088', '123456789012345678', 0), -- invalid email -- invalid usa_state_code -- invalid lei -- invalid perm_id
'VXQ400F1OBWAVPBJP86','594918104','BBG000B3YB97','US38259P5088', '123456789012345678', 0, -150.0), -- invalid email -- invalid usa_state_code -- invalid lei -- invalid perm_id
('hawk eye', '{(utc_now - datetime.timedelta(days=5)).strftime("%Y-%m-%d")}',
40, 60, 'bow', '1739c676-6108-4dd2-8984-2459df746', '+1 123-456-7890',
'user@@example.com', 'TX', '30301', 51.1657, 10.4515,'123-45-67890','12345',
'FKRD00GCEYWDCNYLNF60','38259P508','BBG000B57Y12','US83165F1026', '5647382910564738291', 90), -- invalid weapon_id --invalid email -- invalid ssn -- invalid sedol
'FKRD00GCEYWDCNYLNF60','38259P508','BBG000B57Y12','US83165F1026', '5647382910564738291', 90, 50.0), -- invalid weapon_id --invalid email -- invalid ssn -- invalid sedol
('clark kent', '{(utc_now - datetime.timedelta(days=6)).strftime("%Y-%m-%d")}',
35, 50, '', '7be61b2c-45dc-4889-97e3-9202e8', '09123.456.7890',
'contact@company.org', 'ZZ', '123456', 51.5074, -0.1278,'666-45-6789','34A56B7',
'6R5J00FMIANQQH6JMN56','83165F102','BBG000B9XRY','US0231351067', '1234-5678-9012-3456-78X', 0), -- invalid weapon_id -- invalid phone -- invalid usa_state_code -- invalid usa_zip_code -- invalid ssn -- invalid sedol -- invalid figi -- invalid perm_id
'6R5J00FMIANQQH6JMN56','83165F102','BBG000B9XRY','US0231351067', '1234-5678-9012-3456-78X', 0, -25.0), -- invalid weapon_id -- invalid phone -- invalid usa_state_code -- invalid usa_zip_code -- invalid ssn -- invalid sedol -- invalid figi -- invalid perm_id
('black widow', '{(utc_now - datetime.timedelta(days=6)).strftime("%Y-%m-%d")}',
35, 50, '', '7be61b2c-45dc-4889-97e3-9202e8032c73', '+1 (123) 456-7890',
'jane_smith123@domain.co.uk', 'FL', '90210', 483.8566, 2.3522,'001-01-0001','456VGHY',
'0FPB00BBRHUYOE7DSK19','023135106','BBG000B6R530','US037833100', '2345-6789-0123-4567-890', 70) -- invalid isin -- invalid sedol
'0FPB00BBRHUYOE7DSK19','023135106','BBG000B6R530','US037833100', '2345-6789-0123-4567-890', 70, 30.0) -- invalid isin -- invalid sedol
"""

postgresql_connection.execute(text(insert_query))
Expand Down Expand Up @@ -611,7 +612,7 @@ def test_should_return_90th_percentile_age(
percentile_90 = postgres_datasource.query_get_percentile(
table=self.TABLE_NAME, field="age", percentile=0.9
)
assert percentile_90 == 1500
assert percentile_90 == 1500 # Expected 90th percentile value.

def test_should_return_count_zero(self, postgres_datasource: PostgresDataSource):
count_zero = postgres_datasource.query_zero_metric(
Expand All @@ -624,3 +625,19 @@ def test_should_return_percent_zero(self, postgres_datasource: PostgresDataSourc
table=self.TABLE_NAME, field="salary", operation="percent"
)
assert round(percent_zero, 2) == 50.0

def test_should_return_count_negative(
    self, postgres_datasource: PostgresDataSource
):
    """Expect a negative-``price`` count of 3 from the ``count`` operation."""
    query_args = {"table": self.TABLE_NAME, "field": "price", "operation": "count"}
    negative_rows = postgres_datasource.query_negative_metric(**query_args)
    assert negative_rows == 3

def test_should_return_percent_negative(
    self, postgres_datasource: PostgresDataSource
):
    """Expect 50.0 (to two decimals) from the ``percent`` operation on ``price``."""
    query_args = {"table": self.TABLE_NAME, "field": "price", "operation": "percent"}
    negative_share = postgres_datasource.query_negative_metric(**query_args)
    assert round(negative_share, 2) == 50.0
Loading