Skip to content

Commit

Permalink
add: Null keyword completeness validation
Browse files Browse the repository at this point in the history
add: Null keyword completeness validation

add: Null keyword completeness validation

add: Null keyword completeness validation

add: Null keyword completeness validation

add: yaml file
  • Loading branch information
Ksaurav3380 committed Sep 7, 2024
1 parent cd8448c commit 93bbdcb
Show file tree
Hide file tree
Showing 8 changed files with 130 additions and 9 deletions.
2 changes: 2 additions & 0 deletions dcs_core/core/common/models/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,8 @@ class ValidationFunction(str, Enum):
PERCENT_EMPTY_STRING = "percent_empty_string"
COUNT_NAN = "count_nan"
PERCENT_NAN = "percent_nan"
COUNT_NULL_KEYWORD = "count_null_keyword"
PERCENT_NULL_KEYWORD = "percent_null_keyboard"

# Custom SQL
CUSTOM_SQL = "custom_sql"
Expand Down
4 changes: 2 additions & 2 deletions dcs_core/core/datasource/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,15 +78,15 @@ def _create_data_source(
data_source_name = data_source_config.name
data_source_type = data_source_config.type
if data_source_type == "spark_df":
from datachecks.integrations.databases.spark_df import SparkDFDataSource
from dcs_core.integrations.databases.spark_df import SparkDFDataSource

return SparkDFDataSource(
data_source_name,
{"spark_session": data_source_config.connection_config.spark_session},
)
try:
module_name = (
f"datachecks.integrations.databases.{data_source_config.type.value}"
f"dcs_core.integrations.databases.{data_source_config.type.value}"
)
module = importlib.import_module(module_name)
data_source_class = self.DATA_SOURCE_CLASS_NAME_MAPPER[
Expand Down
25 changes: 25 additions & 0 deletions dcs_core/core/datasource/sql_datasource.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,3 +674,28 @@ def query_negative_metric(

result = self.fetchone(query)[0]
return round(result, 2) if operation == "percent" else result

def query_get_null_keyword_count(
    self, table: str, field: str, operation: str, filters: str = None
) -> Union[int, float]:
    """
    Get the count or percentage of NULL-like keyword values in the specified column.

    A value is NULL-like when its lower-cased text is one of
    'nothing', 'nil', 'null', 'none' or 'n/a'.

    :param table: table name
    :param field: column name
    :param operation: "percent" returns the percentage of NULL-like values
        (rounded to 2 decimals); any other value returns the raw count
    :param filters: optional filter condition, applied as the WHERE clause
    :return: count of NULL-like keyword values, or their percentage of all
        selected rows; 0 when no rows are selected
    """
    qualified_table_name = self.qualified_table_name(table)

    query = (
        f"SELECT SUM(CASE WHEN LOWER({field}) IN "
        "('nothing', 'nil', 'null', 'none', 'n/a') THEN 1 ELSE 0 END) AS null_count, "
        f"COUNT(*) AS total_count FROM {qualified_table_name}"
    )

    # The base query has no WHERE clause, so the filter must introduce one;
    # appending it with AND would produce invalid SQL.
    if filters:
        query += f" WHERE {filters}"

    result = self.fetchone(query)
    if not result:
        return 0

    # SUM() yields NULL (None) when no rows are selected; report that as 0.
    null_count = result[0] or 0
    total_count = result[1] or 0

    if operation == "percent":
        return round((null_count / total_count) * 100, 2) if total_count > 0 else 0

    return null_count
26 changes: 26 additions & 0 deletions dcs_core/core/validation/completeness_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,29 @@ def _generate_metric_value(self, **kwargs) -> Union[float, int]:
)
else:
raise ValueError("Invalid data source type")


class CountNullKeywordValidation(Validation):
    """Validation whose metric is the count of NULL-like keyword values in a field."""

    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        """
        Ask the SQL data source for the NULL-like keyword count of the field.

        :return: value returned by ``query_get_null_keyword_count`` with operation "count"
        :raises ValueError: if the data source is not a SQLDataSource
        """
        source = self.data_source
        if not isinstance(source, SQLDataSource):
            raise ValueError("Invalid data source type")

        return source.query_get_null_keyword_count(
            table=self.dataset_name,
            field=self.field_name,
            operation="count",
            filters=self.where_filter,
        )


class PercentageNullKeywordValidation(Validation):
    """Validation whose metric is the percentage of NULL-like keyword values in a field."""

    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        """
        Ask the SQL data source for the NULL-like keyword percentage of the field.

        :return: value returned by ``query_get_null_keyword_count`` with operation "percent"
        :raises ValueError: if the data source is not a SQLDataSource
        """
        source = self.data_source
        if not isinstance(source, SQLDataSource):
            raise ValueError("Invalid data source type")

        return source.query_get_null_keyword_count(
            table=self.dataset_name,
            field=self.field_name,
            operation="percent",
            filters=self.where_filter,
        )
4 changes: 4 additions & 0 deletions dcs_core/core/validation/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,10 @@
from dcs_core.core.validation.base import Validation
from dcs_core.core.validation.completeness_validation import ( # noqa F401 this is used in globals
CountEmptyStringValidation,
CountNullKeywordValidation,
CountNullValidation,
PercentageEmptyStringValidation,
PercentageNullKeywordValidation,
PercentageNullValidation,
)
from dcs_core.core.validation.custom_query_validation import ( # noqa F401 this is used in globals
Expand Down Expand Up @@ -165,6 +167,8 @@ class ValidationManager:
ValidationFunction.PERCENT_ZERO.value: "PercentZeroValidation",
ValidationFunction.COUNT_NEGATIVE.value: "CountNegativeValidation",
ValidationFunction.PERCENT_NEGATIVE.value: "PercentNegativeValidation",
ValidationFunction.COUNT_NULL_KEYWORD.value: "CountNullKeywordValidation",
ValidationFunction.PERCENT_NULL_KEYWORD.value: "PercentageNullKeywordValidation",
}

def __init__(
Expand Down
24 changes: 24 additions & 0 deletions docs/validations/validity.md
Original file line number Diff line number Diff line change
Expand Up @@ -505,3 +505,27 @@ validations for product_db.products:
on: percent_negative(price)
threshold: "< 40"
```
## COUNT_NULL_KEYWORD
The count null keyword validation counts the number of null-like keyword values (such as `null`, `nil`, `none`, `nothing` or `n/a`, compared case-insensitively) in a column.
**Example**
```yaml title="dcs_config.yaml"
validations for product_db.products:
- count_null_keyword:
on: count_null_keyword(keyword)
threshold: <=10
```
## PERCENT_NULL_KEYWORD
The percent null keyword validation calculates the percentage of null-like keyword values (such as `null`, `nil`, `none`, `nothing` or `n/a`, compared case-insensitively) in a column.
**Example**
```yaml title="dcs_config.yaml"
validations for product_db.products:
- percent_null_keyword:
on: percent_null_keyboard(keyword)
```
31 changes: 31 additions & 0 deletions tests/core/configuration/test_configuration_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -1025,3 +1025,34 @@ def test_should_parse_percent_negative_validation():
.get_validation_function
== ValidationFunction.PERCENT_NEGATIVE
)


def test_should_parse_count_null_keyword():
    """A ``count_null_keyword(...)`` expression parses to ValidationFunction.COUNT_NULL_KEYWORD."""
    yaml_string = """
validations for source.table:
- test:
on: count_null_keyword(keyword)
threshold: <=10
"""
    configuration = load_configuration_from_yaml_str(yaml_string)
    parsed_validation = configuration.validations["source.table"].validations["test"]
    expected_function = ValidationFunction.COUNT_NULL_KEYWORD
    assert parsed_validation.get_validation_function == expected_function


def test_should_parse_percent_null_keyword():
    """The percent null keyword expression parses to ValidationFunction.PERCENT_NULL_KEYWORD."""
    yaml_string = """
validations for source.table:
- test:
on: percent_null_keyboard(keyword)
"""
    configuration = load_configuration_from_yaml_str(yaml_string)
    parsed_validation = configuration.validations["source.table"].validations["test"]
    expected_function = ValidationFunction.PERCENT_NULL_KEYWORD
    assert parsed_validation.get_validation_function == expected_function
23 changes: 16 additions & 7 deletions tests/integration/datasource/test_sql_datasource.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,8 @@ def setup_tables(
isin VARCHAR(12),
perm_id VARCHAR(50),
salary INTEGER,
price FLOAT
price FLOAT,
null_keyword VARCHAR(50)
)
"""
)
Expand All @@ -144,27 +145,27 @@ def setup_tables(
('thor', '{(utc_now - datetime.timedelta(days=10)).strftime("%Y-%m-%d")}',
1500, NULL, 'thor hammer', 'e7194aaa-5516-4362-a5ff-6ff971976bec',
'123-456-7890', 'jane.doe@domain', 'C2', 'ABCDE', 40.0678, -7555555554.0060,'856-45-6789','0067340',
'JRIK0092LOAUCXTR6042','03783310','BBG000B9XRY4','US0378331005', '1234--5678-9012--3456-789', 0, 100.0), -- invalid email -- invalid usa_state_code -- invalid usa_zip_code -- invalid cusip -- invalid perm_id
'JRIK0092LOAUCXTR6042','03783310','BBG000B9XRY4','US0378331005', '1234--5678-9012--3456-789', 0, 100.0,'null'), -- invalid email -- invalid usa_state_code -- invalid usa_zip_code -- invalid cusip -- invalid perm_id
('captain america', '{(utc_now - datetime.timedelta(days=3)).strftime("%Y-%m-%d")}',
90, 80, 'shield', 'e7194aaa-5516-4362-a5ff-6ff971976b', '(123) 456-7890',
'john.doe@.com ', 'NY', '12-345', 34.0522, -118.2437,'000-12-3456', 'B01HL06',
'CDR300OS7OJENVEDDW89','037833100','BBG000BL2H25','US5949181045', '1234567890123456789', 1000, -50.0), -- invalid weapon_id --invalid email -- invalid usa_zip_code -- invalid ssn
'CDR300OS7OJENVEDDW89','037833100','BBG000BL2H25','US5949181045', '1234567890123456789', 1000, -50.0,'Alvin'), -- invalid weapon_id --invalid email -- invalid usa_zip_code -- invalid ssn
('iron man', '{(utc_now - datetime.timedelta(days=4)).strftime("%Y-%m-%d")}',
50, 70, 'suit', '1739c676-6108-4dd2-8984-2459df744936', '123 456 7890',
'contact@company..org', 'XY', '85001', 37.7749, -122.4194,'859-99-9999','4155586',
'VXQ400F1OBWAVPBJP86','594918104','BBG000B3YB97','US38259P5088', '123456789012345678', 0, -150.0), -- invalid email -- invalid usa_state_code -- invalid lei -- invalid perm_id
'VXQ400F1OBWAVPBJP86','594918104','BBG000B3YB97','US38259P5088', '123456789012345678', 0, -150.0,'nil'), -- invalid email -- invalid usa_state_code -- invalid lei -- invalid perm_id
('hawk eye', '{(utc_now - datetime.timedelta(days=5)).strftime("%Y-%m-%d")}',
40, 60, 'bow', '1739c676-6108-4dd2-8984-2459df746', '+1 123-456-7890',
'user@@example.com', 'TX', '30301', 51.1657, 10.4515,'123-45-67890','12345',
'FKRD00GCEYWDCNYLNF60','38259P508','BBG000B57Y12','US83165F1026', '5647382910564738291', 90, 50.0), -- invalid weapon_id --invalid email -- invalid ssn -- invalid sedol
'FKRD00GCEYWDCNYLNF60','38259P508','BBG000B57Y12','US83165F1026', '5647382910564738291', 90, 50.0,'Simon'), -- invalid weapon_id --invalid email -- invalid ssn -- invalid sedol
('clark kent', '{(utc_now - datetime.timedelta(days=6)).strftime("%Y-%m-%d")}',
35, 50, '', '7be61b2c-45dc-4889-97e3-9202e8', '09123.456.7890',
'contact@company.org', 'ZZ', '123456', 51.5074, -0.1278,'666-45-6789','34A56B7',
'6R5J00FMIANQQH6JMN56','83165F102','BBG000B9XRY','US0231351067', '1234-5678-9012-3456-78X', 0, -25.0), -- invalid weapon_id -- invalid phone -- invalid usa_state_code -- invalid usa_zip_code -- invalid ssn -- invalid sedol -- invalid figi -- invalid perm_id
'6R5J00FMIANQQH6JMN56','83165F102','BBG000B9XRY','US0231351067', '1234-5678-9012-3456-78X', 0, -25.0,'None'), -- invalid weapon_id -- invalid phone -- invalid usa_state_code -- invalid usa_zip_code -- invalid ssn -- invalid sedol -- invalid figi -- invalid perm_id
('black widow', '{(utc_now - datetime.timedelta(days=6)).strftime("%Y-%m-%d")}',
35, 50, '', '7be61b2c-45dc-4889-97e3-9202e8032c73', '+1 (123) 456-7890',
'jane_smith123@domain.co.uk', 'FL', '90210', 483.8566, 2.3522,'001-01-0001','456VGHY',
'0FPB00BBRHUYOE7DSK19','023135106','BBG000B6R530','US037833100', '2345-6789-0123-4567-890', 70, 30.0) -- invalid isin -- invalid sedol
'0FPB00BBRHUYOE7DSK19','023135106','BBG000B6R530','US037833100', '2345-6789-0123-4567-890', 70, 30.0,'Ram') -- invalid isin -- invalid sedol
"""

postgresql_connection.execute(text(insert_query))
Expand Down Expand Up @@ -641,3 +642,11 @@ def test_should_return_percent_negative(
table=self.TABLE_NAME, field="price", operation="percent"
)
assert round(percent_negative, 2) == 50.0

def test_should_return_row_count_for_null_keyword(
    self, postgres_datasource: PostgresDataSource
):
    """The "count" operation on the ``null_keyword`` column should report 3 values."""
    expected_count = 3
    null_keyword_count = postgres_datasource.query_get_null_keyword_count(
        table=self.TABLE_NAME,
        field="null_keyword",
        operation="count",
    )
    assert null_keyword_count == expected_count

0 comments on commit 93bbdcb

Please sign in to comment.