Skip to content

Commit

Permalink
Rule miner finds regular expressions.
Browse files Browse the repository at this point in the history
  • Loading branch information
piotrczarnas committed Aug 18, 2024
1 parent ae2b29d commit c8c4a55
Show file tree
Hide file tree
Showing 12 changed files with 381 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ export default function RuleMining({
propose_top_values_checks: true,
propose_text_conversion_checks: true,
propose_standard_pattern_checks: true,
detect_regular_expressions: true,
propose_whitespace_checks: true,
apply_pii_checks: true,
values_in_set_treat_rare_values_as_invalid: true,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,15 @@ export default function RuleMiningFilters({
onChangeConfiguration({ propose_standard_pattern_checks: e })
}
/>
<Checkbox
className="p-2 !w-62"
tooltipText="Analyze sample text values and try to find a regular expression that detects valid values similar to the sample values."
label="Detect regular expressions"
checked={configuration.detect_regular_expressions}
onChange={(e) =>
onChangeConfiguration({ detect_regular_expressions: e })
}
/>
<Checkbox
className="p-2 !w-62"
tooltipText="Propose the default configuration for the whitespace detection checks. Whitespace checks detect common data quality issues with storing text representations of null values, such as 'null', 'None', 'n/a' and other texts that should be stored as regular NULL values."
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ public boolean proposeCheckConfiguration(ProfilingCheckResult sourceProfilingChe
DqoRuleMiningConfigurationProperties checkMiningConfigurationProperties,
JsonSerializer jsonSerializer,
RuleMiningRuleRegistry ruleMiningRuleRegistry) {
if (!miningParameters.isProposeStandardPatternChecks()) {
if (!miningParameters.isDetectRegularExpressions()) {
return false;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

import com.dqops.checks.AbstractCheckSpec;
import com.dqops.checks.AbstractRootChecksContainerSpec;
import com.dqops.checks.CheckType;
import com.dqops.checks.DefaultDataQualityDimensions;
import com.dqops.connectors.DataTypeCategory;
import com.dqops.core.configuration.DqoRuleMiningConfigurationProperties;
Expand All @@ -27,6 +28,8 @@
import com.dqops.sensors.column.patterns.ColumnPatternsTextsNotMatchingRegexPercentSensorParametersSpec;
import com.dqops.services.check.mapping.models.CheckModel;
import com.dqops.services.check.mining.*;
import com.dqops.services.check.mining.regex.CommonRegexPatternAnalyzer;
import com.dqops.services.check.mining.regex.RegexPatternAnalyzerParameters;
import com.dqops.utils.serialization.IgnoreEmptyYamlSerializer;
import com.dqops.utils.serialization.JsonSerializer;
import com.fasterxml.jackson.annotation.JsonIgnore;
Expand All @@ -36,8 +39,12 @@
import com.fasterxml.jackson.databind.annotation.JsonNaming;
import com.fasterxml.jackson.databind.annotation.JsonSerialize;
import lombok.EqualsAndHashCode;
import org.apache.parquet.Strings;

import java.time.Instant;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;

/**
* This check validates text values using a pattern defined as a regular expression.
Expand Down Expand Up @@ -223,10 +230,60 @@ public boolean proposeCheckConfiguration(ProfilingCheckResult sourceProfilingChe
DqoRuleMiningConfigurationProperties checkMiningConfigurationProperties,
JsonSerializer jsonSerializer,
RuleMiningRuleRegistry ruleMiningRuleRegistry) {
if (!miningParameters.isProposeStandardPatternChecks()) {
if (!miningParameters.isDetectRegularExpressions()) {
return false;
}

CheckType checkType = parentCheckRootContainer.getCheckType();
if (checkType != CheckType.profiling && sourceProfilingCheck.getProfilingCheckModel() != null &&
sourceProfilingCheck.getProfilingCheckModel().getRule().hasAnyRulesConfigured()) {
// copy the results from an already configured profiling check
return super.proposeCheckConfiguration(sourceProfilingCheck, dataAssetProfilingResults, tableProfilingResults,
tableSpec, parentCheckRootContainer, myCheckModel, miningParameters,
columnTypeCategory, checkMiningConfigurationProperties, jsonSerializer, ruleMiningRuleRegistry);
}

if (!(dataAssetProfilingResults instanceof ColumnDataAssetProfilingResults)) {
return false;
}

ColumnDataAssetProfilingResults columnDataAssetProfilingResults = (ColumnDataAssetProfilingResults) dataAssetProfilingResults;
if (sourceProfilingCheck.getActualValue() == null && Strings.isNullOrEmpty(this.parameters.getRegex())) {
if (columnTypeCategory != null && columnTypeCategory != DataTypeCategory.text) {
return false;
}

Double percentOfStringValues = columnDataAssetProfilingResults.matchPercentageOfSamples(value -> {
return value instanceof String && !Objects.equals("", value);
});

if (percentOfStringValues == null || percentOfStringValues < 100.0) {
return false;
}

List<String> listOfTextSamples = columnDataAssetProfilingResults.getSampleValues().stream()
.map(profilingSampleValue -> profilingSampleValue.getValue().toString())
.collect(Collectors.toList());

RegexPatternAnalyzerParameters analyzerParameters = new RegexPatternAnalyzerParameters(); // default parameters
String commonRegex = CommonRegexPatternAnalyzer.findCommonRegex(listOfTextSamples, analyzerParameters);

if (commonRegex == null) {
return false;
}

String beginEndRegex = "^(" + commonRegex + ")$";
this.parameters.setRegex(beginEndRegex);

sourceProfilingCheck.setActualValue(0.0); // a placeholder value, as if there were no invalid values; it enables the check (even if the check later fails), because a correct value cannot be calculated from the samples
sourceProfilingCheck.setExecutedAt(Instant.now());
}

if (sourceProfilingCheck.getActualValue() != null && sourceProfilingCheck.getActualValue() > miningParameters.getMaxPercentErrorRowsForPercentChecks()) {
return false; // do not configure this check when the value was captured and the percentage of invalid values exceeds the maximum allowed for percent checks
}


return super.proposeCheckConfiguration(sourceProfilingCheck, dataAssetProfilingResults, tableProfilingResults,
tableSpec, parentCheckRootContainer, myCheckModel, miningParameters, columnTypeCategory,
checkMiningConfigurationProperties, jsonSerializer, ruleMiningRuleRegistry);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
*/
package com.dqops.sensors.column.patterns;

import com.dqops.metadata.fields.ControlDisplayHint;
import com.dqops.metadata.fields.DisplayHint;
import com.dqops.metadata.fields.SampleValues;
import com.dqops.metadata.id.ChildHierarchyNodeFieldMap;
import com.dqops.metadata.id.ChildHierarchyNodeFieldMapImpl;
Expand Down Expand Up @@ -43,6 +45,7 @@ public class ColumnPatternsTextNotMatchingRegexCountSensorParametersSpec extends

@JsonPropertyDescription("This field can be used to define custom regex. In order to define custom regex, user should write correct regex as a string. If regex is not defined by user then default regex is null")
@SampleValues(values = { "^((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[0-9][0-9]|[0-9])[.]){3}(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[0-9][0-9]|[0-9])$" })
@ControlDisplayHint(DisplayHint.textarea)
@RequiredField
private String regex;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@

package com.dqops.sensors.column.patterns;

import com.dqops.metadata.fields.ControlDisplayHint;
import com.dqops.metadata.fields.DisplayHint;
import com.dqops.metadata.fields.SampleValues;
import com.dqops.metadata.id.ChildHierarchyNodeFieldMap;
import com.dqops.metadata.id.ChildHierarchyNodeFieldMapImpl;
Expand Down Expand Up @@ -44,6 +46,7 @@ public class ColumnPatternsTextsNotMatchingRegexPercentSensorParametersSpec exte

@JsonPropertyDescription("This field can be used to define custom regex. In order to define custom regex, user should write correct regex as a string. If regex is not defined by user then default regex is null")
@SampleValues(values = { "^((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[0-9][0-9]|[0-9])[.]){3}(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[0-9][0-9]|[0-9])$" })
@ControlDisplayHint(DisplayHint.textarea)
@RequiredField
private String regex;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,12 @@ public class CheckMiningParametersModel implements Cloneable {
@JsonPropertyDescription("Propose the default configuration for the patterns check that validate the format of popular text patterns, such as UUIDs, phone numbers, or emails. DQOps will configure these data quality checks when analyzed columns contain enough values matching a standard pattern. The default value of this parameter is 'true'.")
private boolean proposeStandardPatternChecks = true;

/**
* Analyze sample text values and try to find a regular expression that detects valid values similar to the sample values. The default value of this parameter is 'true'.
*/
@JsonPropertyDescription("Analyze sample text values and try to find a regular expression that detects valid values similar to the sample values. The default value of this parameter is 'true'.")
private boolean detectRegularExpressions = true;

/**
* Propose the default configuration for the whitespace detection checks. Whitespace checks detect common data quality issues with storing text representations of null values, such as 'null', 'None', 'n/a' and other texts that should be stored as regular NULL values. The default value of this parameter is 'true'.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19825,6 +19825,10 @@
"type" : "boolean",
"description" : "Propose the default configuration for the patterns check that validate the format of popular text patterns, such as UUIDs, phone numbers, or emails. DQOps will configure these data quality checks when analyzed columns contain enough values matching a standard pattern. The default value of this parameter is 'true'."
},
"detect_regular_expressions" : {
"type" : "boolean",
"description" : "Analyze sample text values and try to find a regular expression that detects valid values similar to the sample values. The default value of this parameter is 'true'."
},
"propose_whitespace_checks" : {
"type" : "boolean",
"description" : "Propose the default configuration for the whitespace detection checks. Whitespace checks detect common data quality issues with storing text representations of null values, such as 'null', 'None', 'n/a' and other texts that should be stored as regular NULL values. The default value of this parameter is 'true'."
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16834,6 +16834,11 @@ definitions:
\ or emails. DQOps will configure these data quality checks when analyzed\
\ columns contain enough values matching a standard pattern. The default\
\ value of this parameter is 'true'."
detect_regular_expressions:
type: "boolean"
description: "Analyze sample text values and try to find a regular expression\
\ that detects valid values similar to the sample values. The default value\
\ of this parameter is 'true'."
propose_whitespace_checks:
type: "boolean"
description: "Propose the default configuration for the whitespace detection\
Expand Down
Loading

0 comments on commit c8c4a55

Please sign in to comment.