This page describes some basic use cases of the tool.
PLEASE NOTE: In below commands, my_bq_conn refers to the connection name for your BigQuery project. We are validating BigQuery tables that are available in BigQuery public datasets. These examples validate a table against itself for example purposes.
Also, note that if no aggregation flag is provided, the tool will run a 'COUNT *' as the default aggregation.
data-validation validate column -sc my_bq_conn -tc my_bq_conn -tbls bigquery-public-data.new_york_citibike.citibike_trips
data-validation validate column -sc my_bq_conn -tc my_bq_conn -tbls bigquery-public-data.new_york_citibike.citibike_trips,bigquery-public-data.new_york_citibike.citibike_stations
data-validation validate column -sc my_bq_conn -tc my_bq_conn -tbls bigquery-public-data.new_york_citibike.citibike_trips -c citibike.yaml
Above command creates a YAML file named citibike.yaml that can be used to run validations in the future.
Please note: When the config-file (-c) option is provided, only the YAML file gets created. The validation doesn’t execute.
data-validation run-config -c citibike.yaml
Above command executes validations stored in a config file named citibike.yaml.
data-validation validate column -sc my_bq_conn -tc my_bq_conn -tbls bigquery-public-data.new_york_citibike.citibike_trips --count '*'
data-validation validate column -sc my_bq_conn -tc my_bq_conn -tbls bigquery-public-data.new_york_citibike.citibike_trips --count bikeid,gender
data-validation validate row -sc my_bq_conn -tc my_bq_conn -tbls bigquery-public-data.new_york_citibike.citibike_trips --primary-keys station_id --hash '*'
data-validation validate column -sc my_bq_conn -tc my_bq_conn -tbls bigquery-public-data.new_york_citibike.citibike_trips --count tripduration,start_station_name -bqrh $YOUR_PROJECT_ID.pso_data_validator.results
Please replace $YOUR_PROJECT_ID with the correct project-id where you created your results datasets as mentioned in the installation section.
SELECT
run_id,
validation_name,
validation_type,
group_by_columns,
source_table_name,
source_agg_value,
target_agg_value,
pct_difference,
pct_threshold,
difference,
start_time
FROM
`pso_data_validator.results`
ORDER BY
start_time DESC
data-validation validate column -sc my_bq_conn -tc my_bq_conn -tbls bigquery-public-data.new_york_citibike.citibike_trips --grouped-columns bikeid
data-validation validate column -sc my_bq_conn -tc my_bq_conn -tbls bigquery-public-data.new_york_citibike.citibike_trips --grouped-columns bikeid,usertype
data-validation validate column -sc my_bq_conn -tc my_bq_conn -tbls bigquery-public-data.new_york_citibike.citibike_stations --sum num_bikes_available
data-validation validate column -sc my_bq_conn -tc my_bq_conn -tbls bigquery-public-data.new_york_citibike.citibike_stations --sum num_bikes_available,num_docks_available
data-validation validate column -sc my_bq_conn -tc my_bq_conn -tbls bigquery-public-data.new_york_citibike.citibike_stations --sum num_bikes_available,num_docks_available --avg num_bikes_disabled,num_docks_disabled
data-validation validate column -sc my_bq_conn -tc my_bq_conn -tbls bigquery-public-data.new_york_citibike.citibike_stations --grouped-columns region_id --sum num_bikes_available,num_docks_available --avg num_bikes_disabled,num_docks_disabled
data-validation validate column -sc my_bq_conn -tc my_bq_conn -tbls bigquery-public-data.new_york_citibike.citibike_stations --grouped-columns region_id --sum num_bikes_available,num_docks_available --filters 'region_id=71' -bqrh $YOUR_PROJECT_ID.pso_data_validator.results
data-validation validate column -sc my_bq_conn -tc my_bq_conn -tbls bigquery-public-data.new_york_citibike.citibike_trips --count tripduration,start_station_name -l tag=test-run,owner=name
data-validation validate schema -sc my_bq_conn -tc my_bq_conn -tbls bigquery-public-data.new_york_citibike.citibike_trips -bqrh $YOUR_PROJECT_ID.pso_data_validator.results
# Additional dependencies needed for GCS files
pip install gcsfs
pip install fsspec
data-validation connections add --connection-name file_conn FileSystem --table-name $FILE_NAME --file-path gs://path/to/file --file-type csv
data-validation connections add --connection-name my_bq_conn BigQuery --project-id $YOUR_PROJECT_ID
# Validate GCS CSV file with BigQuery table
data-validation validate column -sc file_conn -tc my_bq_conn -tbls $FILE_NAME=$YOUR_PROJECT_ID.dataset.table --count $COLUMN
data-validation query
--conn connection-name The named connection to be queried.
--query, -q The Raw query to run against the supplied connection
result_handler:
project_id: my-project-id
table_id: pso_data_validator.results
type: BigQuery
google_service_account_key_path: path/to/sa.json
source: my_bq_conn
target: my_bq_conn
validations:
- aggregates:
- field_alias: count
source_column: null
target_column: null
type: count
- field_alias: sum__num_bikes_available
source_column: num_bikes_available
target_column: num_bikes_available
type: sum
cast: float64
- field_alias: sum__num_docks_available
source_column: num_docks_available
target_column: num_docks_available
type: sum
filters:
- source: region_id=71
target: region_id=71
type: custom
grouped_columns:
- cast: null
field_alias: region_id
source_column: region_id
target_column: region_id
labels:
- !!python/tuple
- description
- test
schema_name: bigquery-public-data.new_york_citibike
table_name: citibike_stations
target_schema_name: bigquery-public-data.new_york_citibike
target_table_name: citibike_stations
threshold: 0.0
type: Column