Skip to content

Commit

Permalink
feat: negative value numeric validation function
Browse files Browse the repository at this point in the history
  • Loading branch information
YamoshiCode committed Sep 5, 2024
1 parent 408e58e commit 4665f05
Show file tree
Hide file tree
Showing 8 changed files with 154 additions and 8 deletions.
2 changes: 2 additions & 0 deletions dcs_core/core/common/models/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ class ValidationFunction(str, Enum):
PERCENTILE_90 = "percentile_90"
COUNT_ZERO = "count_zero"
PERCENT_ZERO = "percent_zero"
COUNT_NEGATIVE = "count_negative"
PERCENT_NEGATIVE = "percent_negative"

# Reliability validations 3
COUNT_ROWS = "count_rows"
Expand Down
25 changes: 25 additions & 0 deletions dcs_core/core/datasource/sql_datasource.py
Original file line number Diff line number Diff line change
Expand Up @@ -649,3 +649,28 @@ def query_zero_metric(
else:
result = self.fetchone(zero_query)[0]
return result

def query_negative_metric(
    self, table: str, field: str, operation: str, filters: str = None
) -> Union[int, float]:
    """Count the negative values of a column, or express them as a percentage.

    :param table: table name; it is qualified via ``self.qualified_table_name``
    :param field: column whose values are compared against zero
    :param operation: ``"percent"`` returns the share of negative rows
        (0-100, rounded to two decimals); any other value returns the raw count
    :param filters: optional SQL boolean expression restricting the rows that
        are considered, for both the negative count and the total count
    :return: the negative-row count, or the percentage of negative rows.
        When no rows match (empty table or filter), the percentage is ``0.0``.

    NOTE: ``table``, ``field`` and ``filters`` are interpolated into the SQL
    text as-is, so they must come from trusted configuration.
    """
    qualified_table_name = self.qualified_table_name(table)

    negative_query = (
        f"SELECT COUNT(*) FROM {qualified_table_name} WHERE {field} < 0"
    )
    total_count_query = f"SELECT COUNT(*) FROM {qualified_table_name}"

    if filters:
        # Parenthesise the filter so an OR inside it cannot bypass the
        # "< 0" predicate through SQL operator precedence.
        negative_query += f" AND ({filters})"
        total_count_query += f" WHERE ({filters})"

    if operation != "percent":
        return self.fetchone(negative_query)[0]

    # NULLIF turns a zero denominator into NULL, so an empty table yields
    # NULL instead of a division-by-zero error.
    query = (
        f"SELECT (CAST(({negative_query}) AS float) / "
        f"CAST(NULLIF(({total_count_query}), 0) AS float)) * 100"
    )
    result = self.fetchone(query)[0]
    if result is None:
        # No rows to measure: report 0% rather than failing in round().
        return 0.0
    return round(result, 2)
4 changes: 4 additions & 0 deletions dcs_core/core/validation/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
)
from dcs_core.core.validation.numeric_validation import ( # noqa F401 this is used in globals
AvgValidation,
CountNegativeValidation,
CountZeroValidation,
MaxValidation,
MinValidation,
Expand All @@ -40,6 +41,7 @@
Percentile60Validation,
Percentile80Validation,
Percentile90Validation,
PercentNegativeValidation,
PercentZeroValidation,
StdDevValidation,
SumValidation,
Expand Down Expand Up @@ -161,6 +163,8 @@ class ValidationManager:
ValidationFunction.PERCENTILE_90.value: "Percentile90Validation",
ValidationFunction.COUNT_ZERO.value: "CountZeroValidation",
ValidationFunction.PERCENT_ZERO.value: "PercentZeroValidation",
ValidationFunction.COUNT_NEGATIVE.value: "CountNegativeValidation",
ValidationFunction.PERCENT_NEGATIVE.value: "PercentNegativeValidation",
}

def __init__(
Expand Down
28 changes: 28 additions & 0 deletions dcs_core/core/validation/numeric_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,3 +216,31 @@ def _generate_metric_value(self, **kwargs) -> float:
)
else:
raise ValueError("Unsupported data source type for PercentZeroValidation")


class CountNegativeValidation(Validation):
    """Validation whose metric is the number of negative values in a field."""

    def _generate_metric_value(self, **kwargs) -> int:
        """Return the negative-value count computed by the SQL data source.

        Raises:
            ValueError: if the data source is not a ``SQLDataSource``.
        """
        source = self.data_source
        if not isinstance(source, SQLDataSource):
            raise ValueError("Unsupported data source type for CountNegativeValidation")

        # where_filter may be None, which the data source treats as "no filter".
        return source.query_negative_metric(
            table=self.dataset_name,
            field=self.field_name,
            operation="count",
            filters=self.where_filter,
        )


class PercentNegativeValidation(Validation):
    """Validation whose metric is the percentage of negative values in a field."""

    def _generate_metric_value(self, **kwargs) -> float:
        """Return the negative-value percentage computed by the SQL data source.

        Raises:
            ValueError: if the data source is not a ``SQLDataSource``.
        """
        source = self.data_source
        if not isinstance(source, SQLDataSource):
            raise ValueError(
                "Unsupported data source type for PercentNegativeValidation"
            )

        # where_filter may be None, which the data source treats as "no filter".
        return source.query_negative_metric(
            table=self.dataset_name,
            field=self.field_name,
            operation="percent",
            filters=self.where_filter,
        )
29 changes: 29 additions & 0 deletions docs/validations/validity.md
Original file line number Diff line number Diff line change
Expand Up @@ -475,4 +475,33 @@ validations for product_db.products:
- price zero percent:
on: percent_zero(price)
threshold: "< 10"
# **Numeric Negative Value Validations**
The Numeric Negative Value Validations detect negative values in numeric fields within a dataset and check that the count or percentage of those values stays within a specified threshold.
## **COUNT_NEGATIVE**
This validation counts the number of negative values present in a given numeric field.
**Example**
```yaml
validations for product_db.products:
  - negative value count should be less than 2:
      on: count_negative(price)
      threshold: "< 2"
```
## **PERCENT_NEGATIVE**
This validation calculates the percentage of negative values in a numeric field, relative to the total number of records.
**Example**
```yaml
validations for product_db.products:
  - negative value percentage should be less than 40%:
      on: percent_negative(price)
      threshold: "< 40"
```
9 changes: 9 additions & 0 deletions examples/configurations/postgres/example_postgres_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,15 @@ validations for iris_pgsql.dcs_iris:
threshold: "< 10"


# **Negative Value Validations**
- price negative value count:
on: count_negative(price)
threshold: "< 2"

- price negative value percentage:
on: percent_negative(price)
threshold: "< 40"

# Uniqueness Metrics
- species duplicate count:
on: count_duplicate(species)
Expand Down
32 changes: 32 additions & 0 deletions tests/core/configuration/test_configuration_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -993,3 +993,35 @@ def test_should_parse_percent_zero_validation():
.threshold.lt
== 10
)


def test_should_parse_count_negative_validation():
    """A ``count_negative(...)`` expression parses to COUNT_NEGATIVE."""
    # ``on`` and ``threshold`` must be nested under the validation name;
    # without that indentation they are not part of the validation entry.
    yaml_string = """
    validations for product_db.products:
      - count_negative for price should be less than 2:
          on: count_negative(price)
          threshold: "< 2"
    """
    configuration = load_configuration_from_yaml_str(yaml_string)
    validation = configuration.validations["product_db.products"].validations[
        "count_negative for price should be less than 2"
    ]
    assert validation.get_validation_function == ValidationFunction.COUNT_NEGATIVE
    # Also pin the parsed threshold, as the percent_zero test does.
    assert validation.threshold.lt == 2


def test_should_parse_percent_negative_validation():
    """A ``percent_negative(...)`` expression parses to PERCENT_NEGATIVE."""
    # ``on`` and ``threshold`` must be nested under the validation name;
    # without that indentation they are not part of the validation entry.
    yaml_string = """
    validations for product_db.products:
      - percent_negative for price should be less than 40%:
          on: percent_negative(price)
          threshold: "< 40"
    """
    configuration = load_configuration_from_yaml_str(yaml_string)
    validation = configuration.validations["product_db.products"].validations[
        "percent_negative for price should be less than 40%"
    ]
    assert validation.get_validation_function == ValidationFunction.PERCENT_NEGATIVE
    # Also pin the parsed threshold, as the percent_zero test does.
    assert validation.threshold.lt == 40
33 changes: 25 additions & 8 deletions tests/integration/datasource/test_sql_datasource.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,8 @@ def setup_tables(
figi VARCHAR(12),
isin VARCHAR(12),
perm_id VARCHAR(50),
salary INTEGER
salary INTEGER,
price FLOAT
)
"""
)
Expand All @@ -143,27 +144,27 @@ def setup_tables(
('thor', '{(utc_now - datetime.timedelta(days=10)).strftime("%Y-%m-%d")}',
1500, NULL, 'thor hammer', 'e7194aaa-5516-4362-a5ff-6ff971976bec',
'123-456-7890', 'jane.doe@domain', 'C2', 'ABCDE', 40.0678, -7555555554.0060,'856-45-6789','0067340',
'JRIK0092LOAUCXTR6042','03783310','BBG000B9XRY4','US0378331005', '1234--5678-9012--3456-789', 0), -- invalid email -- invalid usa_state_code -- invalid usa_zip_code -- invalid cusip -- invalid perm_id
'JRIK0092LOAUCXTR6042','03783310','BBG000B9XRY4','US0378331005', '1234--5678-9012--3456-789', 0, 100.0), -- invalid email -- invalid usa_state_code -- invalid usa_zip_code -- invalid cusip -- invalid perm_id
('captain america', '{(utc_now - datetime.timedelta(days=3)).strftime("%Y-%m-%d")}',
90, 80, 'shield', 'e7194aaa-5516-4362-a5ff-6ff971976b', '(123) 456-7890',
'john.doe@.com ', 'NY', '12-345', 34.0522, -118.2437,'000-12-3456', 'B01HL06',
'CDR300OS7OJENVEDDW89','037833100','BBG000BL2H25','US5949181045', '1234567890123456789', 1000), -- invalid weapon_id --invalid email -- invalid usa_zip_code -- invalid ssn
'CDR300OS7OJENVEDDW89','037833100','BBG000BL2H25','US5949181045', '1234567890123456789', 1000, -50.0), -- invalid weapon_id --invalid email -- invalid usa_zip_code -- invalid ssn
('iron man', '{(utc_now - datetime.timedelta(days=4)).strftime("%Y-%m-%d")}',
50, 70, 'suit', '1739c676-6108-4dd2-8984-2459df744936', '123 456 7890',
'contact@company..org', 'XY', '85001', 37.7749, -122.4194,'859-99-9999','4155586',
'VXQ400F1OBWAVPBJP86','594918104','BBG000B3YB97','US38259P5088', '123456789012345678', 0), -- invalid email -- invalid usa_state_code -- invalid lei -- invalid perm_id
'VXQ400F1OBWAVPBJP86','594918104','BBG000B3YB97','US38259P5088', '123456789012345678', 0, -150.0), -- invalid email -- invalid usa_state_code -- invalid lei -- invalid perm_id
('hawk eye', '{(utc_now - datetime.timedelta(days=5)).strftime("%Y-%m-%d")}',
40, 60, 'bow', '1739c676-6108-4dd2-8984-2459df746', '+1 123-456-7890',
'user@@example.com', 'TX', '30301', 51.1657, 10.4515,'123-45-67890','12345',
'FKRD00GCEYWDCNYLNF60','38259P508','BBG000B57Y12','US83165F1026', '5647382910564738291', 90), -- invalid weapon_id --invalid email -- invalid ssn -- invalid sedol
'FKRD00GCEYWDCNYLNF60','38259P508','BBG000B57Y12','US83165F1026', '5647382910564738291', 90, 50.0), -- invalid weapon_id --invalid email -- invalid ssn -- invalid sedol
('clark kent', '{(utc_now - datetime.timedelta(days=6)).strftime("%Y-%m-%d")}',
35, 50, '', '7be61b2c-45dc-4889-97e3-9202e8', '09123.456.7890',
'contact@company.org', 'ZZ', '123456', 51.5074, -0.1278,'666-45-6789','34A56B7',
'6R5J00FMIANQQH6JMN56','83165F102','BBG000B9XRY','US0231351067', '1234-5678-9012-3456-78X', 0), -- invalid weapon_id -- invalid phone -- invalid usa_state_code -- invalid usa_zip_code -- invalid ssn -- invalid sedol -- invalid figi -- invalid perm_id
'6R5J00FMIANQQH6JMN56','83165F102','BBG000B9XRY','US0231351067', '1234-5678-9012-3456-78X', 0, -25.0), -- invalid weapon_id -- invalid phone -- invalid usa_state_code -- invalid usa_zip_code -- invalid ssn -- invalid sedol -- invalid figi -- invalid perm_id
('black widow', '{(utc_now - datetime.timedelta(days=6)).strftime("%Y-%m-%d")}',
35, 50, '', '7be61b2c-45dc-4889-97e3-9202e8032c73', '+1 (123) 456-7890',
'jane_smith123@domain.co.uk', 'FL', '90210', 483.8566, 2.3522,'001-01-0001','456VGHY',
'0FPB00BBRHUYOE7DSK19','023135106','BBG000B6R530','US037833100', '2345-6789-0123-4567-890', 70) -- invalid isin -- invalid sedol
'0FPB00BBRHUYOE7DSK19','023135106','BBG000B6R530','US037833100', '2345-6789-0123-4567-890', 70, 30.0) -- invalid isin -- invalid sedol
"""

postgresql_connection.execute(text(insert_query))
Expand Down Expand Up @@ -611,7 +612,7 @@ def test_should_return_90th_percentile_age(
percentile_90 = postgres_datasource.query_get_percentile(
table=self.TABLE_NAME, field="age", percentile=0.9
)
assert percentile_90 == 1500
assert percentile_90 == 1500 # Expected 90th percentile value.

def test_should_return_count_zero(self, postgres_datasource: PostgresDataSource):
count_zero = postgres_datasource.query_zero_metric(
Expand All @@ -624,3 +625,19 @@ def test_should_return_percent_zero(self, postgres_datasource: PostgresDataSourc
table=self.TABLE_NAME, field="salary", operation="percent"
)
assert round(percent_zero, 2) == 50.0

def test_should_return_count_negative(
    self, postgres_datasource: PostgresDataSource
):
    """The count operation reports how many ``price`` values are below zero."""
    negative_rows = postgres_datasource.query_negative_metric(
        table=self.TABLE_NAME,
        field="price",
        operation="count",
    )
    assert negative_rows == 3

def test_should_return_percent_negative(
    self, postgres_datasource: PostgresDataSource
):
    """The percent operation reports the share of negative ``price`` values."""
    negative_share = postgres_datasource.query_negative_metric(
        table=self.TABLE_NAME,
        field="price",
        operation="percent",
    )
    rounded_share = round(negative_share, 2)
    assert rounded_share == 50.0

0 comments on commit 4665f05

Please sign in to comment.