def query_negative_metric(
    self, table: str, field: str, operation: str, filters: str = None
) -> Union[int, float]:
    """
    Count the negative values of a column, or express them as a percentage of rows.

    :param table: table name, resolved through ``self.qualified_table_name``
    :param field: column that is compared against zero
    :param operation: ``"percent"`` returns the share of negative rows
        (0-100, rounded to 2 decimals); any other value returns the raw count
    :param filters: optional SQL condition restricting the rows considered
    :return: number of negative rows, or percentage of negative rows.
        The percentage is ``0.0`` when no row matches at all.
    """
    qualified_table_name = self.qualified_table_name(table)

    # A single scan yields both numbers. COUNT skips the NULL produced by a
    # CASE without ELSE, so the first column only counts rows with field < 0.
    query = (
        f"SELECT COUNT(CASE WHEN {field} < 0 THEN 1 END), COUNT(*) "
        f"FROM {qualified_table_name}"
    )
    if filters:
        # Parenthesised so a filter containing OR stays one condition and is
        # applied identically to the negative count and the total count.
        query += f" WHERE ({filters})"

    row = self.fetchone(query)
    negative_count, total_count = row[0], row[1]

    if operation != "percent":
        return negative_count

    # Dividing in Python lets us guard the empty case instead of letting the
    # database fail on a division by zero.
    if not total_count:
        return 0.0
    return round(negative_count / total_count * 100, 2)
class PercentNegativeValidation(Validation):
    """Validation whose metric is the percentage of negative values in a field."""

    def _generate_metric_value(self, **kwargs) -> float:
        """
        Ask the SQL data source for the percentage of negative values.

        :return: value returned by ``query_negative_metric`` for ``"percent"``
        :raises ValueError: when the data source is not a ``SQLDataSource``
        """
        source = self.data_source
        if not isinstance(source, SQLDataSource):
            raise ValueError(
                "Unsupported data source type for PercentNegativeValidation"
            )

        # where_filter is forwarded unchanged; a missing filter stays None.
        return source.query_negative_metric(
            table=self.dataset_name,
            field=self.field_name,
            operation="percent",
            filters=self.where_filter,
        )
def test_should_parse_count_negative_validation():
    """``count_negative(price)`` is parsed as ValidationFunction.COUNT_NEGATIVE."""
    yaml_string = """
    validations for product_db.products:
      - count_negative for price should be less than 2:
          on: count_negative(price)
          threshold: "< 2"
    """
    parsed = load_configuration_from_yaml_str(yaml_string)

    dataset_validations = parsed.validations["product_db.products"].validations
    validation = dataset_validations["count_negative for price should be less than 2"]
    assert validation.get_validation_function == ValidationFunction.COUNT_NEGATIVE


def test_should_parse_percent_negative_validation():
    """``percent_negative(price)`` is parsed as ValidationFunction.PERCENT_NEGATIVE."""
    yaml_string = """
    validations for product_db.products:
      - percent_negative for price should be less than 40%:
          on: percent_negative(price)
          threshold: "< 40"
    """
    parsed = load_configuration_from_yaml_str(yaml_string)

    dataset_validations = parsed.validations["product_db.products"].validations
    validation = dataset_validations[
        "percent_negative for price should be less than 40%"
    ]
    assert validation.get_validation_function == ValidationFunction.PERCENT_NEGATIVE
datetime.timedelta(days=4)).strftime("%Y-%m-%d")}', 50, 70, 'suit', '1739c676-6108-4dd2-8984-2459df744936', '123 456 7890', 'contact@company..org', 'XY', '85001', 37.7749, -122.4194,'859-99-9999','4155586', - 'VXQ400F1OBWAVPBJP86','594918104','BBG000B3YB97','US38259P5088', '123456789012345678'), -- invalid email -- invalid usa_state_code -- invalid lei -- invalid perm_id + 'VXQ400F1OBWAVPBJP86','594918104','BBG000B3YB97','US38259P5088', '123456789012345678', -150.0), -- invalid email -- invalid usa_state_code -- invalid lei -- invalid perm_id ('hawk eye', '{(utc_now - datetime.timedelta(days=5)).strftime("%Y-%m-%d")}', 40, 60, 'bow', '1739c676-6108-4dd2-8984-2459df746', '+1 123-456-7890', 'user@@example.com', 'TX', '30301', 51.1657, 10.4515,'123-45-67890','12345', - 'FKRD00GCEYWDCNYLNF60','38259P508','BBG000B57Y12','US83165F1026', '5647382910564738291'), -- invalid weapon_id --invalid email -- invalid ssn -- invalid sedol + 'FKRD00GCEYWDCNYLNF60','38259P508','BBG000B57Y12','US83165F1026', '5647382910564738291', 50.0), -- invalid weapon_id --invalid email -- invalid ssn -- invalid sedol ('clark kent', '{(utc_now - datetime.timedelta(days=6)).strftime("%Y-%m-%d")}', 35, 50, '', '7be61b2c-45dc-4889-97e3-9202e8', '09123.456.7890', 'contact@company.org', 'ZZ', '123456', 51.5074, -0.1278,'666-45-6789','34A56B7', - '6R5J00FMIANQQH6JMN56','83165F102','BBG000B9XRY','US0231351067', '1234-5678-9012-3456-78X'), -- invalid weapon_id -- invalid phone -- invalid usa_state_code -- invalid usa_zip_code -- invalid ssn -- invalid sedol -- invalid figi -- invalid perm_id + '6R5J00FMIANQQH6JMN56','83165F102','BBG000B9XRY','US0231351067', '1234-5678-9012-3456-78X', -25.0), -- invalid weapon_id -- invalid phone -- invalid usa_state_code -- invalid usa_zip_code -- invalid ssn -- invalid sedol -- invalid figi -- invalid perm_id ('black widow', '{(utc_now - datetime.timedelta(days=6)).strftime("%Y-%m-%d")}', 35, 50, '', '7be61b2c-45dc-4889-97e3-9202e8032c73', '+1 (123) 456-7890', 
'jane_smith123@domain.co.uk', 'FL', '90210', 483.8566, 2.3522,'001-01-0001','456VGHY', - '0FPB00BBRHUYOE7DSK19','023135106','BBG000B6R530','US037833100', '2345-6789-0123-4567-890') -- invalid isin -- invalid sedol + '0FPB00BBRHUYOE7DSK19','023135106','BBG000B6R530','US037833100', '2345-6789-0123-4567-890', 30.0) -- invalid isin -- invalid sedol """ postgresql_connection.execute(text(insert_query)) @@ -578,7 +579,7 @@ def test_should_return_20th_percentile_age( percentile_20 = postgres_datasource.query_get_percentile( table=self.TABLE_NAME, field="age", percentile=0.2 ) - assert percentile_20 == 35 # Expected 20th percentile value. + assert percentile_20 == 35 def test_should_return_40th_percentile_age( self, postgres_datasource: PostgresDataSource @@ -586,7 +587,7 @@ def test_should_return_40th_percentile_age( percentile_40 = postgres_datasource.query_get_percentile( table=self.TABLE_NAME, field="age", percentile=0.4 ) - assert percentile_40 == 40 # Expected 40th percentile value. + assert percentile_40 == 40 def test_should_return_60th_percentile_age( self, postgres_datasource: PostgresDataSource @@ -594,7 +595,7 @@ def test_should_return_60th_percentile_age( percentile_60 = postgres_datasource.query_get_percentile( table=self.TABLE_NAME, field="age", percentile=0.6 ) - assert percentile_60 == 50 # Expected 60th percentile value. + assert percentile_60 == 50 def test_should_return_80th_percentile_age( self, postgres_datasource: PostgresDataSource @@ -602,7 +603,7 @@ def test_should_return_80th_percentile_age( percentile_80 = postgres_datasource.query_get_percentile( table=self.TABLE_NAME, field="age", percentile=0.8 ) - assert percentile_80 == 90 # Expected 80th percentile value. 
def test_should_return_count_negative(
    self, postgres_datasource: PostgresDataSource
):
    """The fixture rows hold three negative prices (-50, -150, -25)."""
    negative_rows = postgres_datasource.query_negative_metric(
        operation="count",
        field="price",
        table=self.TABLE_NAME,
    )
    assert negative_rows == 3


def test_should_return_percent_negative(
    self, postgres_datasource: PostgresDataSource
):
    """Three negative prices out of six fixture rows give 50 percent."""
    negative_share = postgres_datasource.query_negative_metric(
        operation="percent",
        field="price",
        table=self.TABLE_NAME,
    )
    assert round(negative_share, 2) == 50.0