feat: zero numeric validation function #249

Merged · 1 commit · Sep 5, 2024
2 changes: 2 additions & 0 deletions dcs_core/core/common/models/validation.py
@@ -80,6 +80,8 @@ class ValidationFunction(str, Enum):
PERCENTILE_60 = "percentile_60"
PERCENTILE_80 = "percentile_80"
PERCENTILE_90 = "percentile_90"
COUNT_ZERO = "count_zero"
PERCENT_ZERO = "percent_zero"

# Reliability validations 3
COUNT_ROWS = "count_rows"
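Because `ValidationFunction` mixes `str` into `Enum`, each member is itself a string and compares equal to its raw value, which is what lets the `count_zero` / `percent_zero` keywords in the YAML configs and the `.value`-keyed mapping in the validation manager line up. A small standalone illustration (only the two members added in this PR are reproduced):

```python
from enum import Enum


# Standalone sketch of the str-Enum pattern used by ValidationFunction;
# only the two members added in this PR are shown.
class ValidationFunction(str, Enum):
    COUNT_ZERO = "count_zero"
    PERCENT_ZERO = "percent_zero"


# A str-mixin Enum member compares equal to the plain string parsed from YAML.
assert ValidationFunction.COUNT_ZERO == "count_zero"
assert ValidationFunction.PERCENT_ZERO.value == "percent_zero"
```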
27 changes: 27 additions & 0 deletions dcs_core/core/datasource/sql_datasource.py
@@ -622,3 +622,30 @@ def query_get_percentile(
if filters:
query += f" WHERE {filters}"
return round(self.fetchone(query)[0], 2)

def query_zero_metric(
self, table: str, field: str, operation: str, filters: str = None
) -> Union[int, float]:
qualified_table_name = self.qualified_table_name(table)

zero_query = f"SELECT COUNT(*) FROM {qualified_table_name} WHERE {field} = 0"

if filters:
zero_query += f" AND {filters}"

if operation == "percent":
total_count_query = f"SELECT COUNT(*) FROM {qualified_table_name}"
if filters:
total_count_query += f" WHERE {filters}"

zero_count = self.fetchone(zero_query)[0]
total_count = self.fetchone(total_count_query)[0]

if total_count == 0:
return 0.0

result = (zero_count / total_count) * 100
return round(result, 2)
else:
result = self.fetchone(zero_query)[0]
return result
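For reference, the new method simply assembles plain `COUNT(*)` statements: one counting rows where the field equals zero, plus a second total-row count when `operation="percent"`, with an early `0.0` return guarding against division by zero on an empty table. A minimal standalone sketch of the string-building logic (the table, field, and filter values below are hypothetical, not taken from the PR):

```python
# Sketch only: mirrors the query-string construction in query_zero_metric above.
# Table, field, and filter values are hypothetical examples.
def build_zero_queries(qualified_table_name: str, field: str, filters: str = None):
    zero_query = f"SELECT COUNT(*) FROM {qualified_table_name} WHERE {field} = 0"
    if filters:
        zero_query += f" AND {filters}"

    total_count_query = f"SELECT COUNT(*) FROM {qualified_table_name}"
    if filters:
        total_count_query += f" WHERE {filters}"

    return zero_query, total_count_query


zero_q, total_q = build_zero_queries("product_db.products", "price", "discontinued = false")
print(zero_q)   # SELECT COUNT(*) FROM product_db.products WHERE price = 0 AND discontinued = false
print(total_q)  # SELECT COUNT(*) FROM product_db.products WHERE discontinued = false
```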
4 changes: 4 additions & 0 deletions dcs_core/core/validation/manager.py
@@ -32,13 +32,15 @@
)
from dcs_core.core.validation.numeric_validation import ( # noqa F401 this is used in globals
AvgValidation,
CountZeroValidation,
MaxValidation,
MinValidation,
Percentile20Validation,
Percentile40Validation,
Percentile60Validation,
Percentile80Validation,
Percentile90Validation,
PercentZeroValidation,
StdDevValidation,
SumValidation,
VarianceValidation,
@@ -157,6 +159,8 @@ class ValidationManager:
ValidationFunction.PERCENTILE_60.value: "Percentile60Validation",
ValidationFunction.PERCENTILE_80.value: "Percentile80Validation",
ValidationFunction.PERCENTILE_90.value: "Percentile90Validation",
ValidationFunction.COUNT_ZERO.value: "CountZeroValidation",
ValidationFunction.PERCENT_ZERO.value: "PercentZeroValidation",
}

def __init__(
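The mapping above stores validation class names as strings, and the `# noqa F401 this is used in globals` comment on the import block suggests the manager resolves those strings through the module namespace at runtime. A self-contained sketch of that string-to-class dispatch pattern, with stand-in classes rather than the project's real ones:

```python
# Sketch of the globals()-based dispatch hinted at by the noqa comment above.
# The classes and mapping here are stand-ins, not the project's actual code.
class CountZeroValidation:
    pass


class PercentZeroValidation:
    pass


VALIDATION_CLASS_MAPPING = {
    "count_zero": "CountZeroValidation",
    "percent_zero": "PercentZeroValidation",
}


def resolve_validation_class(validation_function: str) -> type:
    # Look up the class name string, then fetch the class object from the
    # importing module's namespace.
    class_name = VALIDATION_CLASS_MAPPING[validation_function]
    return globals()[class_name]


assert resolve_validation_class("count_zero") is CountZeroValidation
assert resolve_validation_class("percent_zero") is PercentZeroValidation
```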
26 changes: 26 additions & 0 deletions dcs_core/core/validation/numeric_validation.py
@@ -190,3 +190,29 @@ def _generate_metric_value(self, **kwargs) -> float:
)
else:
raise ValueError("Unsupported data source type for Percentile90Validation")


class CountZeroValidation(Validation):
def _generate_metric_value(self, **kwargs) -> int:
if isinstance(self.data_source, SQLDataSource):
return self.data_source.query_zero_metric(
table=self.dataset_name,
field=self.field_name,
operation="count",
filters=self.where_filter if self.where_filter is not None else None,
)
else:
raise ValueError("Unsupported data source type for CountZeroValidation")


class PercentZeroValidation(Validation):
def _generate_metric_value(self, **kwargs) -> float:
if isinstance(self.data_source, SQLDataSource):
return self.data_source.query_zero_metric(
table=self.dataset_name,
field=self.field_name,
operation="percent",
filters=self.where_filter if self.where_filter is not None else None,
)
else:
raise ValueError("Unsupported data source type for PercentZeroValidation")
32 changes: 31 additions & 1 deletion docs/validations/validity.md
@@ -445,4 +445,34 @@ The percent permid validation checks the percentage of valid permid in a dataset
validations for product_db.products:
- percent_permid_of_user:
on: percent_permid(perm_id)
```

## Zero Value Validations

Zero value validations identify and validate fields that contain zero values. They matter for datasets where a zero carries specific meaning, such as missing or invalid data, or a genuine real-world condition.

## `COUNT_ZERO`

`COUNT_ZERO` counts the number of rows where the specified field is zero. It is useful for detecting cases where a zero may represent missing data or a special condition.

**Example:**

```yaml
validations for product_db.products:
- price zero count:
on: count_zero(price)
threshold: "< 52"
```

## `PERCENT_ZERO`

`PERCENT_ZERO` calculates the percentage of rows where the specified field is zero. It measures the proportion of zero values in a column and lets you enforce percentage-based data-quality thresholds.

**Example:**

```yaml
validations for product_db.products:
- price zero percent:
on: percent_zero(price)
threshold: "< 10"
```
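To make the relationship between the two metrics concrete, here is the arithmetic the `percent` path performs, using the row counts from this PR's integration-test fixture (3 of the 6 seeded rows have `salary = 0`):

```python
# Worked example using the counts asserted in the integration tests below.
zero_count = 3        # rows where salary = 0 in the seeded test table
total_count = 6       # total seeded rows
percent_zero = round((zero_count / total_count) * 100, 2)
print(zero_count, percent_zero)  # 3 50.0 -- matches the test assertions
```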
49 changes: 29 additions & 20 deletions examples/configurations/postgres/example_postgres_config.yaml
@@ -18,6 +18,35 @@ validations for iris_pgsql.dcs_iris:
- sepal length stddev size:
on: stddev(sepal_length)
threshold: "< 0.5"
# Percentile Validations
- sepal length 20th percentile:
on: percentile_20(sepal_length)
threshold: "> 10"

- sepal length 40th percentile:
on: percentile_40(sepal_length)
threshold: "> 20"

- sepal length 60th percentile:
on: percentile_60(sepal_length)
threshold: "> 30"

- sepal length 80th percentile:
on: percentile_80(sepal_length)
threshold: "> 40"

- sepal length 90th percentile:
on: percentile_90(sepal_length)
threshold: "> 50"

# Zero Numeric Validations
- sepal length zero count:
on: count_zero(sepal_length)
threshold: "< 5"
- sepal length zero percent:
on: percent_zero(sepal_length)
threshold: "< 10"


# Uniqueness Metrics
- species duplicate count:
@@ -138,23 +167,3 @@ validations for product_db.products:
on: percent_longitude(longitude)
threshold: "> 80"

44 changes: 44 additions & 0 deletions tests/core/configuration/test_configuration_v1.py
@@ -949,3 +949,47 @@ def test_should_parse_90th_percentile_validation():
.get_validation_function
== ValidationFunction.PERCENTILE_90
)


def test_should_parse_count_zero_validation():
yaml_string = """
validations for product_db.products:
- test:
on: count_zero(price)
threshold: "< 52"
"""
configuration = load_configuration_from_yaml_str(yaml_string)
assert (
configuration.validations["product_db.products"]
.validations["test"]
.get_validation_function
== ValidationFunction.COUNT_ZERO
)
assert (
configuration.validations["product_db.products"]
.validations["test"]
.threshold.lt
== 52
)


def test_should_parse_percent_zero_validation():
yaml_string = """
validations for product_db.products:
- test:
on: percent_zero(price)
threshold: "< 10"
"""
configuration = load_configuration_from_yaml_str(yaml_string)
assert (
configuration.validations["product_db.products"]
.validations["test"]
.get_validation_function
== ValidationFunction.PERCENT_ZERO
)
assert (
configuration.validations["product_db.products"]
.validations["test"]
.threshold.lt
== 10
)
37 changes: 25 additions & 12 deletions tests/integration/datasource/test_sql_datasource.py
@@ -130,7 +130,8 @@ def setup_tables(
cusip VARCHAR(9),
figi VARCHAR(12),
isin VARCHAR(12),
perm_id VARCHAR(50),
salary INTEGER
)
"""
)
@@ -142,27 +143,27 @@
('thor', '{(utc_now - datetime.timedelta(days=10)).strftime("%Y-%m-%d")}',
1500, NULL, 'thor hammer', 'e7194aaa-5516-4362-a5ff-6ff971976bec',
'123-456-7890', 'jane.doe@domain', 'C2', 'ABCDE', 40.0678, -7555555554.0060,'856-45-6789','0067340',
'JRIK0092LOAUCXTR6042','03783310','BBG000B9XRY4','US0378331005', '1234--5678-9012--3456-789', 0), -- invalid email -- invalid usa_state_code -- invalid usa_zip_code -- invalid cusip -- invalid perm_id
('captain america', '{(utc_now - datetime.timedelta(days=3)).strftime("%Y-%m-%d")}',
90, 80, 'shield', 'e7194aaa-5516-4362-a5ff-6ff971976b', '(123) 456-7890',
'john.doe@.com ', 'NY', '12-345', 34.0522, -118.2437,'000-12-3456', 'B01HL06',
'CDR300OS7OJENVEDDW89','037833100','BBG000BL2H25','US5949181045', '1234567890123456789', 1000), -- invalid weapon_id --invalid email -- invalid usa_zip_code -- invalid ssn
('iron man', '{(utc_now - datetime.timedelta(days=4)).strftime("%Y-%m-%d")}',
50, 70, 'suit', '1739c676-6108-4dd2-8984-2459df744936', '123 456 7890',
'contact@company..org', 'XY', '85001', 37.7749, -122.4194,'859-99-9999','4155586',
'VXQ400F1OBWAVPBJP86','594918104','BBG000B3YB97','US38259P5088', '123456789012345678', 0), -- invalid email -- invalid usa_state_code -- invalid lei -- invalid perm_id
('hawk eye', '{(utc_now - datetime.timedelta(days=5)).strftime("%Y-%m-%d")}',
40, 60, 'bow', '1739c676-6108-4dd2-8984-2459df746', '+1 123-456-7890',
'user@@example.com', 'TX', '30301', 51.1657, 10.4515,'123-45-67890','12345',
'FKRD00GCEYWDCNYLNF60','38259P508','BBG000B57Y12','US83165F1026', '5647382910564738291', 90), -- invalid weapon_id --invalid email -- invalid ssn -- invalid sedol
('clark kent', '{(utc_now - datetime.timedelta(days=6)).strftime("%Y-%m-%d")}',
35, 50, '', '7be61b2c-45dc-4889-97e3-9202e8', '09123.456.7890',
'contact@company.org', 'ZZ', '123456', 51.5074, -0.1278,'666-45-6789','34A56B7',
'6R5J00FMIANQQH6JMN56','83165F102','BBG000B9XRY','US0231351067', '1234-5678-9012-3456-78X', 0), -- invalid weapon_id -- invalid phone -- invalid usa_state_code -- invalid usa_zip_code -- invalid ssn -- invalid sedol -- invalid figi -- invalid perm_id
('black widow', '{(utc_now - datetime.timedelta(days=6)).strftime("%Y-%m-%d")}',
35, 50, '', '7be61b2c-45dc-4889-97e3-9202e8032c73', '+1 (123) 456-7890',
'jane_smith123@domain.co.uk', 'FL', '90210', 483.8566, 2.3522,'001-01-0001','456VGHY',
'0FPB00BBRHUYOE7DSK19','023135106','BBG000B6R530','US037833100', '2345-6789-0123-4567-890', 70) -- invalid isin -- invalid sedol
"""

postgresql_connection.execute(text(insert_query))
@@ -578,36 +579,48 @@ def test_should_return_20th_percentile_age(
percentile_20 = postgres_datasource.query_get_percentile(
table=self.TABLE_NAME, field="age", percentile=0.2
)
assert percentile_20 == 35

def test_should_return_40th_percentile_age(
self, postgres_datasource: PostgresDataSource
):
percentile_40 = postgres_datasource.query_get_percentile(
table=self.TABLE_NAME, field="age", percentile=0.4
)
assert percentile_40 == 40

def test_should_return_60th_percentile_age(
self, postgres_datasource: PostgresDataSource
):
percentile_60 = postgres_datasource.query_get_percentile(
table=self.TABLE_NAME, field="age", percentile=0.6
)
assert percentile_60 == 50

def test_should_return_80th_percentile_age(
self, postgres_datasource: PostgresDataSource
):
percentile_80 = postgres_datasource.query_get_percentile(
table=self.TABLE_NAME, field="age", percentile=0.8
)
assert percentile_80 == 90

def test_should_return_90th_percentile_age(
self, postgres_datasource: PostgresDataSource
):
percentile_90 = postgres_datasource.query_get_percentile(
table=self.TABLE_NAME, field="age", percentile=0.9
)
assert percentile_90 == 1500

def test_should_return_count_zero(self, postgres_datasource: PostgresDataSource):
count_zero = postgres_datasource.query_zero_metric(
table=self.TABLE_NAME, field="salary", operation="count"
)
assert count_zero == 3

def test_should_return_percent_zero(self, postgres_datasource: PostgresDataSource):
percent_zero = postgres_datasource.query_zero_metric(
table=self.TABLE_NAME, field="salary", operation="percent"
)
assert round(percent_zero, 2) == 50.0