-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
* upgrade BigQuery Sampler * beautify code * revert old way of profiler & data quality, keep fetch new way sample * Update profiler_source.py * Update profiler_source.py --------- Co-authored-by: hung.duong <hung.duong@be.com.vn> Co-authored-by: Teddy <teddy.crepineau@gmail.com>
- Loading branch information
1 parent
97140e1
commit 64f147c
Showing
6 changed files
with
202 additions
and
4 deletions.
There are no files selected for viewing
Empty file.
52 changes: 52 additions & 0 deletions
52
ingestion/src/metadata/profiler/interface/sqlalchemy/bigquery/profiler_interface.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
# Copyright 2021 Collate | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
""" | ||
Interfaces with database for BigQuery engine | ||
supporting sqlalchemy abstraction layer | ||
""" | ||
|
||
from metadata.profiler.interface.sqlalchemy.profiler_interface import ( | ||
SQAProfilerInterface, | ||
) | ||
from metadata.profiler.processor.sqlalchemy.bigquery_sampler import BigQuerySampler | ||
|
||
|
||
class BigQueryProfilerInterface(SQAProfilerInterface):
    """
    SQLAlchemy profiler interface specialised for BigQuery.

    The only BigQuery-specific behaviour defined here is the sampler:
    `_instantiate_sampler` returns a `BigQuerySampler`.
    """

    _profiler_type: str = "BigQuery"

    def __init__(self, **kwargs):
        # Keyword-only pass-through to the SQLAlchemy profiler interface.
        super().__init__(**kwargs)

    def _instantiate_sampler(
        self,
        session,
        table,
        sample_columns,
        profile_sample_config,
        partition_details,
        profile_sample_query,
    ):
        """
        Create the BigQuery sampler for the given table.

        Every argument is forwarded unchanged, by keyword, to
        `BigQuerySampler`.

        :return: a new `BigQuerySampler` instance
        """
        sampler_kwargs = {
            "session": session,
            "table": table,
            "sample_columns": sample_columns,
            "profile_sample_config": profile_sample_config,
            "partition_details": partition_details,
            "profile_sample_query": profile_sample_query,
        }
        return BigQuerySampler(**sampler_kwargs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
71 changes: 71 additions & 0 deletions
71
ingestion/src/metadata/profiler/processor/sqlalchemy/bigquery_sampler.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
# Copyright 2021 Collate | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
""" | ||
Helper module to handle BigQuery data sampling | ||
for the profiler | ||
""" | ||
|
||
from metadata.generated.schema.entity.data.table import ProfileSampleType, TableData | ||
from metadata.profiler.processor.sqlalchemy.sampler import Sampler | ||
from metadata.profiler.source.bigquery.queries import BIGQUERY_TABLESAMPLE | ||
|
||
|
||
class BigQuerySampler(Sampler):
    """
    Sample a BigQuery table with `TABLESAMPLE SYSTEM` so that sample data
    can be fetched without running the query over the whole table.

    The attributes read by this class (`table`, `sample_columns`,
    `profile_sample`, `profile_sample_type`, `sample_limit`, `session`,
    `_profile_sample_query`) are not defined here.
    NOTE(review): presumably they are set by the parent `Sampler` -- confirm.
    """

    # SQL template rendered by `get_bq_sample_query`.
    sample_stmt = BIGQUERY_TABLESAMPLE
    # Percentage of the table scanned when the profile sample is not
    # expressed as a percentage.
    default_percent = 10

    def __init__(
        self,
        **kwargs,
    ):
        super().__init__(**kwargs)

    def get_bq_sample_query(self) -> str:
        """
        Render the sampling statement for the current table.

        With a PERCENTAGE profile sample, `profile_sample` is used as the
        sampling percentage and the result is capped at `sample_limit` rows.
        Otherwise `default_percent` is used as the sampling percentage and
        `profile_sample` is used as the row cap.

        :return: the formatted SQL statement
        """
        if self.profile_sample_type == ProfileSampleType.PERCENTAGE:
            percent, result_limit = self.profile_sample, self.sample_limit
        else:
            percent, result_limit = self.default_percent, self.profile_sample

        # `str.format` renders an unset value as the literal "None", which
        # would produce invalid SQL ("None PERCENT" / "LIMIT None").
        # NOTE(review): assumes `profile_sample` may be left as None when no
        # profile sample is configured -- confirm against the base Sampler.
        if percent is None:
            percent = self.default_percent
        if result_limit is None:
            result_limit = self.sample_limit

        quoted_columns = ", ".join(
            f"`{col_name.lower()}`" for col_name in self.sample_columns
        )
        return self.sample_stmt.format(
            table=self.table.__tablename__,
            col=quoted_columns,
            relative_table=self.table.__table__,
            percent=percent,
            result_limit=result_limit,
        )

    def fetch_sqa_sample_data(self) -> TableData:
        """
        Use the sampler to retrieve sample data rows as per limit given by user.

        When a user-defined sample query is set, it takes precedence and the
        rows are fetched through `_fetch_sample_data_from_user_query`.
        Otherwise the statement built by `get_bq_sample_query` is executed
        on the session.

        :return: TableData to be added to the Table Entity
        """
        if self._profile_sample_query:
            return self._fetch_sample_data_from_user_query()

        bq_sample = self.session.execute(self.get_bq_sample_query())
        return TableData(
            columns=list(self.sample_columns),
            rows=[list(row) for row in bq_sample],
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
26 changes: 26 additions & 0 deletions
26
ingestion/src/metadata/profiler/source/bigquery/queries.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
# Copyright 2021 Collate | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
""" | ||
BigQuery Queries for fetching sample data | ||
""" | ||
|
||
import textwrap | ||
|
||
# SQL template used to pull a sample of a BigQuery table.
#
# Placeholders filled in with `str.format`:
#   table           - table name, used to build the CTE name
#   col             - comma-separated list of columns to select
#   relative_table  - table reference placed in the FROM clause
#   percent         - percentage passed to TABLESAMPLE SYSTEM
#   result_limit    - maximum number of rows returned
#
# The CTE name is backtick-quoted so that table names which are not plain
# identifiers (e.g. containing a dash) still produce a valid statement.
BIGQUERY_TABLESAMPLE = textwrap.dedent(
    """
    WITH `{table}_cte` AS (
    SELECT {col}
    FROM `{relative_table}` TABLESAMPLE SYSTEM ({percent} PERCENT)
    )
    SELECT * FROM `{table}_cte`
    LIMIT {result_limit}
    """
)