diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 7a9e8f785a08..d1bc3385dc53 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -26,7 +26,6 @@ /dolphinscheduler-common/ @SbloodyS /dolphinscheduler-dao/ @SbloodyS @ruanwenjun /dolphinscheduler-dao-plugin/ @SbloodyS @ruanwenjun -/dolphinscheduler-data-quality/ @SbloodyS /dolphinscheduler-datasource-plugin/ @SbloodyS /dolphinscheduler-dist/ @SbloodyS /dolphinscheduler-e2e/ @SbloodyS diff --git a/.github/actions/labeler/labeler.yml b/.github/actions/labeler/labeler.yml index 6bd9b6daf1e5..de0f8319bf46 100644 --- a/.github/actions/labeler/labeler.yml +++ b/.github/actions/labeler/labeler.yml @@ -23,7 +23,6 @@ backend: - 'dolphinscheduler-common/**/*' - 'dolphinscheduler-dao/**/*' - 'dolphinscheduler-dao-plugin/**/*' - - 'dolphinscheduler-data-quality/**/*' - 'dolphinscheduler-datasource-plugin/**/*' - 'dolphinscheduler-dist/**/*' - 'dolphinscheduler-extract/**/*' diff --git a/config/plugins_config b/config/plugins_config index 6fac612b01c0..eff859100fe5 100644 --- a/config/plugins_config +++ b/config/plugins_config @@ -77,7 +77,6 @@ dolphinscheduler-storage-s3 dolphinscheduler-task-aliyunserverlessspark dolphinscheduler-task-chunjun dolphinscheduler-task-datafactory -dolphinscheduler-task-dataquality dolphinscheduler-task-datasync dolphinscheduler-task-datax dolphinscheduler-task-dinky diff --git a/deploy/kubernetes/dolphinscheduler/README.md b/deploy/kubernetes/dolphinscheduler/README.md index 4a38a5e4d3f3..d64053ffdd27 100644 --- a/deploy/kubernetes/dolphinscheduler/README.md +++ b/deploy/kubernetes/dolphinscheduler/README.md @@ -131,7 +131,6 @@ Please refer to the [Quick Start in Kubernetes](../../../docs/docs/en/guide/inst | conf.common."aws.s3.endpoint" | string | `"http://minio:9000"` | You need to set this parameter when private cloud s3. If S3 uses public cloud, you only need to set resource.aws.region or set to the endpoint of a public cloud such as S3.cn-north-1.amazonaws.com.cn | | conf.common."aws.s3.region" | string | `"ca-central-1"` | The AWS Region to use. if resource.storage.type=S3, This configuration is required | | conf.common."conda.path" | string | `"/opt/anaconda3/etc/profile.d/conda.sh"` | set path of conda.sh | -| conf.common."data-quality.jar.dir" | string | `nil` | data quality option | | conf.common."data.basedir.path" | string | `"/tmp/dolphinscheduler"` | user data local directory path, please make sure the directory exists and have read write permissions | | conf.common."datasource.encryption.enable" | bool | `false` | datasource encryption enable | | conf.common."datasource.encryption.salt" | string | `"!@#$%^&*"` | datasource encryption salt | diff --git a/deploy/kubernetes/dolphinscheduler/values.yaml b/deploy/kubernetes/dolphinscheduler/values.yaml index 41a7dfb9c286..5658a29c1189 100644 --- a/deploy/kubernetes/dolphinscheduler/values.yaml +++ b/deploy/kubernetes/dolphinscheduler/values.yaml @@ -336,9 +336,6 @@ conf: # -- datasource encryption salt datasource.encryption.salt: '!@#$%^&*' - # -- data quality option - data-quality.jar.dir: - # -- Whether hive SQL is executed in the same session support.hive.oneSession: false @@ -987,7 +984,6 @@ api: # cloud: [] # logic: [] # dataIntegration: [] - # dataQuality: [] # machineLearning: [] # other: [] diff --git a/docs/configs/docsdev.js b/docs/configs/docsdev.js index 8cf9d4d2f92a..0fa7adf2612d 100644 --- a/docs/configs/docsdev.js +++ b/docs/configs/docsdev.js @@ -457,10 +457,6 @@ export default { } ], }, - { - title: 'Data Quality', - link: '/en-us/docs/dev/user_doc/guide/data-quality.html', - }, { title: 'Remote Logging', link: '/en-us/docs/dev/user_doc/guide/remote-logging.html', @@ -1160,10 +1156,6 @@ export default { } ], }, - { - title: '数据质量', - link: '/zh-cn/docs/dev/user_doc/guide/data-quality.html', - }, { title: '远程日志存储', link: '/zh-cn/docs/dev/user_doc/guide/remote-logging.html', diff --git a/docs/docs/en/architecture/configuration.md b/docs/docs/en/architecture/configuration.md index 567163faed19..86d4357e1bd4 100644 --- a/docs/docs/en/architecture/configuration.md +++ b/docs/docs/en/architecture/configuration.md @@ -224,7 +224,6 @@ The default configuration is as follows: | yarn.job.history.status.address | http://ds1:19888/ws/v1/history/mapreduce/jobs/%s | job history status url of yarn | | datasource.encryption.enable | false | whether to enable datasource encryption | | datasource.encryption.salt | !@#$%^&* | the salt of the datasource encryption | -| data-quality.jar.dir | | the jar of data quality | | support.hive.oneSession | false | specify whether hive SQL is executed in the same session | | sudo.enable | true | whether to enable sudo | | alert.rpc.port | 50052 | the RPC port of Alert Server | diff --git a/docs/docs/en/guide/data-quality.md b/docs/docs/en/guide/data-quality.md deleted file mode 100644 index dca777d76fb8..000000000000 --- a/docs/docs/en/guide/data-quality.md +++ /dev/null @@ -1,313 +0,0 @@ -# Data Quality - -## Introduction - -The data quality task is used to check the data accuracy during the integration and processing of data. Data quality tasks in this release include single-table checking, single-table custom SQL checking, multi-table accuracy, and two-table value comparisons. The running environment of the data quality task is Spark 2.4.0, and other versions have not been verified, and users can verify by themselves. - -The execution logic of the data quality task is as follows: - -- The user defines the task in the interface, and the user input value is stored in `TaskParam`. -- When running a task, `Master` will parse `TaskParam`, encapsulate the parameters required by `DataQualityTask` and send it to `Worker`. -- Worker runs the data quality task. After the data quality task finishes running, it writes the statistical results to the specified storage engine. -- The current data quality task result is stored in the `t_ds_dq_execute_result` table of `dolphinscheduler` - `Worker` sends the task result to `Master`, after `Master` receives `TaskResponse`, it will judge whether the task type is `DataQualityTask`, if so, it will read the corresponding result from `t_ds_dq_execute_result` according to `taskInstanceId`, and then The result is judged according to the check mode, operator and threshold configured by the user. -- If the result is a failure, the corresponding operation, alarm or interruption will be performed according to the failure policy configured by the user. -- If you package `data-quality` separately, remember to modify the package name to be consistent with `data-quality.jar.dir` in `common.properties` with attribute name `data-quality.jar.dir` -- If the old version is upgraded and used, you need to execute the `sql` update script to initialize the database before running. -- `dolphinscheduler-data-quality-dev-SNAPSHOT.jar` was built with no dependencies. If a `JDBC` driver is required, you can set the `-jars` parameter in the `node settings` `Option Parameters`, e.g. `--jars /lib/jars/mysql-connector-java-8.0.16.jar`. -- Currently only `MySQL`, `PostgreSQL` and `HIVE` data sources have been tested, other data sources have not been tested yet. -- `Spark` needs to be configured to read `Hive` metadata, `Spark` does not use `jdbc` to read `Hive`. - -## Detailed Inspection Logic - -| **Parameter** | **Description** | -|---------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| CheckMethod | [CheckFormula][Operator][Threshold], if the result is true, it indicates that the data does not meet expectations, and the failure strategy is executed. | -| CheckFormula | | -| Operator | =, >, >=, <, <=, != | -| ExpectedValue | | -| Example | | - -In the example, assuming that the actual value is 10, the operator is >, and the expected value is 9, then the result 10 -9 > 0 is true, which means that the row data in the empty column has exceeded the threshold, and the task is judged to fail. - -# Task Operation Guide - -## Null Value Check for Single Table Check - -### Inspection Introduction - -The goal of the null value check is to check the number of empty rows in the specified column. The number of empty rows can be compared with the total number of rows or a specified threshold. If it is greater than a certain threshold, it will be judged as failure. - -- The SQL statement that calculates the null of the specified column is as follows: - - ```sql - SELECT COUNT(*) AS miss FROM ${src_table} WHERE (${src_field} is null or ${src_field} = '') AND (${src_filter}) - ``` -- The SQL to calculate the total number of rows in the table is as follows: - - ```sql - SELECT COUNT(*) AS total FROM ${src_table} WHERE (${src_filter}) - ``` - -### Interface Operation Guide - -![dataquality_null_check](../../../img/tasks/demo/null_check.png) - -| **Parameter** | **Description** | -|------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Source data type | Select MySQL, PostgreSQL, etc. | -| Source data source | The corresponding data source under the source data type. | -| Source data table | Drop-down to select the table where the validation data is located. | -| Src filter conditions | Such as the title, it will also be used when counting the total number of rows in the table, optional. | -| Src table check column | Drop-down to select the check column name. | -| Check method | | -| Check operators | =, >, >=, <, <=, ! = | -| Threshold | The value used in the formula for comparison. | -| Failure strategy | | -| Expected value type | Select the desired type from the drop-down menu. | - -## Timeliness Check of Single Table Check - -### Inspection Introduction - -The timeliness check is used to check whether the data is processed within the expected time. The start time and end time can be specified to define the time range. If the amount of data within the time range does not reach the set threshold, the check task will be judged as fail. - -### Interface Operation Guide - -![dataquality_timeliness_check](../../../img/tasks/demo/timeliness_check.png) - -| **Parameter** | **Description** | -|------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Source data type | Select MySQL, PostgreSQL, etc. | -| Source data source | The corresponding data source under the source data type. | -| Source data table | Drop-down to select the table where the validation data is located. | -| Src filter conditions | Such as the title, it will also be used when counting the total number of rows in the table, optional. | -| Src table check column | Drop-down to select check column name. | -| Start time | The start time of a time range. | -| end time | The end time of a time range. | -| Time Format | Set the corresponding time format. | -| Check method | | -| Check operators | =, >, >=, <, <=, ! = | -| Threshold | The value used in the formula for comparison. | -| Failure strategy | | -| Expected value type | Select the desired type from the drop-down menu. | - -## Field Length Check for Single Table Check - -### Inspection Introduction - -The goal of field length verification is to check whether the length of the selected field meets the expectations. If there is data that does not meet the requirements, and the number of rows exceeds the threshold, the task will be judged to fail. - -### Interface Operation Guide - -![dataquality_length_check](../../../img/tasks/demo/field_length_check.png) - -| **Parameter** | **Description** | -|------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Source data type | Select MySQL, PostgreSQL, etc. | -| Source data source | The corresponding data source under the source data type. | -| Source data table | Drop-down to select the table where the validation data is located. | -| Src filter conditions | Such as the title, it will also be used when counting the total number of rows in the table, optional. | -| Src table check column | Drop-down to select the check column name. | -| Logical operators | =, >, >=, <, <=, ! = | -| Field length limit | Like the title. | -| Check method | | -| Check operators | =, >, >=, <, <=, ! = | -| Threshold | The value used in the formula for comparison. | -| Failure strategy | | -| Expected value type | Select the desired type from the drop-down menu. | - -## Uniqueness Check for Single Table Check - -### Inspection Introduction - -The goal of the uniqueness check is to check whether the fields are duplicated. It is generally used to check whether the primary key is duplicated. If there are duplicates and the threshold is reached, the check task will be judged to be failed. - -### Interface Operation Guide - -![dataquality_uniqueness_check](../../../img/tasks/demo/uniqueness_check.png) - -| **Parameter** | **Description** | -|------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Source data type | Select MySQL, PostgreSQL, etc. | -| Source data source | The corresponding data source under the source data type. | -| Source data table | Drop-down to select the table where the validation data is located. | -| Src filter conditions | Such as the title, it will also be used when counting the total number of rows in the table, optional. | -| Src table check column | Drop-down to select the check column name. | -| Check method | | -| Check operators | =, >, >=, <, <=, ! = | -| Threshold | The value used in the formula for comparison. | -| Failure strategy | | -| Expected value type | Select the desired type from the drop-down menu. | - -## Regular Expression Check for Single Table Check - -### Inspection Introduction - -The goal of regular expression verification is to check whether the format of the value of a field meets the requirements, such as time format, email format, ID card format, etc. If there is data that does not meet the format and exceeds the threshold, the task will be judged as failed. - -### Interface Operation Guide - -![dataquality_regex_check](../../../img/tasks/demo/regexp_check.png) - -| **Parameter** | **Description** | -|------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Source data type | Select MySQL, PostgreSQL, etc. | -| Source data source | The corresponding data source under the source data type. | -| Source data table | Drop-down to select the table where the validation data is located. | -| Src filter conditions | Such as the title, it will also be used when counting the total number of rows in the table, optional. | -| Src table check column | Drop-down to select check column name. | -| Regular expression | As title. | -| Check method | | -| Check operators | =, >, >=, <, <=, ! = | -| Threshold | The value used in the formula for comparison. | -| Failure strategy | | -| Expected value type | Select the desired type from the drop-down menu. | - -## Enumeration Value Validation for Single Table Check - -### Inspection Introduction - -The goal of enumeration value verification is to check whether the value of a field is within the range of the enumeration value. If there is data that is not in the range of the enumeration value and exceeds the threshold, the task will be judged to fail. - -### Interface Operation Guide - -![dataquality_enum_check](../../../img/tasks/demo/enumeration_check.png) - -| **Parameter** | **Description** | -|-----------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Source data type | Select MySQL, PostgreSQL, etc. | -| Source data source | The corresponding data source under the source data type. | -| Source data table | Drop-down to select the table where the validation data is located. | -| Src table filter conditions | Such as title, also used when counting the total number of rows in the table, optional. | -| Src table check column | Drop-down to select the check column name. | -| List of enumeration values | Separated by commas. | -| Check method | | -| Check operators | =, >, >=, <, <=, ! = | -| Threshold | The value used in the formula for comparison. | -| Failure strategy | | -| Expected value type | Select the desired type from the drop-down menu. | - -## Table Row Number Verification for Single Table Check - -### Inspection Introduction - -The goal of table row number verification is to check whether the number of rows in the table reaches the expected value. If the number of rows does not meet the standard, the task will be judged as failed. - -### Interface Operation Guide - -![dataquality_count_check](../../../img/tasks/demo/table_count_check.png) - -| **Parameter** | **Description** | -|------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Source data type | Select MySQL, PostgreSQL, etc. | -| Source data source | The corresponding data source under the source data type. | -| Source data table | Drop-down to select the table where the validation data is located. | -| Src filter conditions | Such as the title, it will also be used when counting the total number of rows in the table, optional. | -| Src table check column | Drop-down to select the check column name. | -| Check method | | -| Check operators | =, >, >=, <, <=, ! = | -| Threshold | The value used in the formula for comparison. | -| Failure strategy | | -| Expected value type | Select the desired type from the drop-down menu. | - -## Custom SQL Check for Single Table Check - -### Interface Operation Guide - -![dataquality_custom_sql_check](../../../img/tasks/demo/custom_sql_check.png) - -| **Parameter** | **Description** | -|------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Source data type | Select MySQL, PostgreSQL, etc. | -| Source data source | The corresponding data source under the source data type. | -| Source data table | Drop-down to select the table where the data to be verified is located. | -| Actual value name | Alias in SQL for statistical value calculation, such as max_num. | -| Actual value calculation SQL | SQL for outputting actual values. Note: | -| Src filter conditions | Such as the title, it will also be used when counting the total number of rows in the table, optional. | -| Check method | | -| Check operators | =, >, >=, <, <=, ! = | -| Threshold | The value used in the formula for comparison. | -| Failure strategy | | -| Expected value type | Select the desired type from the drop-down menu. | - -## Accuracy Check of Multi-table - -### Inspection Introduction - -Accuracy checks are performed by comparing the accuracy differences of data records for selected fields between two tables, examples are as follows -- table test1 - -| c1 | c2 | -|:--:|:--:| -| a | 1 | -| b | 2 | - -- table test2 - -| c21 | c22 | -|:---:|:---:| -| a | 1 | -| b | 3 | - -If you compare the data in c1 and c21, the tables test1 and test2 are exactly the same. If you compare c2 and c22, the data in table test1 and table test2 are inconsistent. - -### Interface Operation Guide - -![dataquality_multi_table_accuracy_check](../../../img/tasks/demo/multi_table_accuracy_check.png) - -| **Parameter** | **Description** | -|--------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Source data type | Select MySQL, PostgreSQL, etc. | -| Source data source | The corresponding data source under the source data type. | -| Source data table | Drop-down to select the table where the data to be verified is located. | -| Src filter conditions | Such as the title, it will also be used when counting the total number of rows in the table, optional. | -| Target data type | Choose MySQL, PostgreSQL, etc. | -| Target data source | The corresponding data source under the source data type. | -| Target data table | Drop-down to select the table where the data to be verified is located. | -| Target filter conditions | Such as the title, it will also be used when counting the total number of rows in the table, optional. | -| Check column | Fill in the source data column, operator and target data column respectively. | -| Verification method | Select the desired verification method. | -| Operators | =, >, >=, <, <=, ! = | -| Failure strategy |