From a3d6c1dd20c6c290b4d223c41d515387f2675f05 Mon Sep 17 00:00:00 2001
From: Imri Paran
Date: Thu, 12 Sep 2024 07:13:01 +0200
Subject: [PATCH] MINOR: tests(datalake): use minio (#17805)

* tests(datalake): use minio

1. Use minio instead of moto to mimic S3 behavior.
2. Removed the moto dependency, since it is not compatible with aiobotocore
   (https://github.com/getmoto/moto/issues/7070#issuecomment-1828484982).

* Moved test_datalake_profiler_e2e.py under the datalake integration tests
  and switched it from moto to minio.

* Fixed tests

* Fixed tests

* Removed the default name for the minio container
---
 ingestion/setup.py                            |   8 +-
 ingestion/tests/integration/conftest.py       |   3 +-
 ingestion/tests/integration/containers.py     |   2 +-
 .../tests/integration/datalake/conftest.py    |  93 ++--
 .../resources/profiler_test_.csv              |   0
 .../datalake/test_datalake_profiler_e2e.py    | 311 +++++++++++++
 .../integration/datalake/test_ingestion.py    |  19 +-
 .../test_datalake_profiler_e2e.py             | 440 ------------------
 .../sources/database/delta_lake/conftest.py   |   2 +-
 .../profiler/pandas/test_custom_metrics.py    |  43 --
 10 files changed, 375 insertions(+), 546 deletions(-)
 rename ingestion/tests/integration/{orm_profiler => datalake}/resources/profiler_test_.csv (100%)
 create mode 100644 ingestion/tests/integration/datalake/test_datalake_profiler_e2e.py
 delete mode 100644 ingestion/tests/integration/orm_profiler/test_datalake_profiler_e2e.py

diff --git a/ingestion/setup.py b/ingestion/setup.py
index aa5bd5aa27cf..84e2f6025274 100644
--- a/ingestion/setup.py
+++ b/ingestion/setup.py
@@ -207,11 +207,8 @@
         *COMMONS["datalake"],
     },
     "datalake-s3": {
-        # requires aiobotocore
-        # https://github.com/fsspec/s3fs/blob/9bf99f763edaf7026318e150c4bd3a8d18bb3a00/requirements.txt#L1
-        # however, the latest version of `s3fs` conflicts its `aiobotocore` dep with `boto3`'s dep on `botocore`.
-        # Leaving this marked to the automatic resolution to speed up installation.
- "s3fs", + # vendoring 'boto3' to keep all dependencies aligned (s3fs, boto3, botocore, aiobotocore) + "s3fs[boto3]", *COMMONS["datalake"], }, "deltalake": {"delta-spark<=2.3.0", "deltalake~=0.17"}, @@ -343,7 +340,6 @@ "coverage", # Install GE because it's not in the `all` plugin VERSIONS["great-expectations"], - "moto~=5.0", "basedpyright~=1.14", "pytest==7.0.0", "pytest-cov", diff --git a/ingestion/tests/integration/conftest.py b/ingestion/tests/integration/conftest.py index 3c987e5aa68f..81f19a2ea878 100644 --- a/ingestion/tests/integration/conftest.py +++ b/ingestion/tests/integration/conftest.py @@ -15,7 +15,8 @@ from metadata.workflow.ingestion import IngestionWorkflow if not sys.version_info >= (3, 9): - collect_ignore = ["trino", "kafka"] + # these tests use test-containers which are not supported in python 3.8 + collect_ignore = ["trino", "kafka", "datalake"] @pytest.fixture(scope="session", autouse=True) diff --git a/ingestion/tests/integration/containers.py b/ingestion/tests/integration/containers.py index 9483f2468a4d..3bf46b799c18 100644 --- a/ingestion/tests/integration/containers.py +++ b/ingestion/tests/integration/containers.py @@ -53,7 +53,7 @@ class MinioContainerConfigs: access_key: str = "minio" secret_key: str = "password" port: int = 9000 - container_name: str = "test-minio" + container_name: Optional[str] = None exposed_port: Optional[int] = None def with_exposed_port(self, container): diff --git a/ingestion/tests/integration/datalake/conftest.py b/ingestion/tests/integration/datalake/conftest.py index 1ed88fa8ffb6..337bea1081af 100644 --- a/ingestion/tests/integration/datalake/conftest.py +++ b/ingestion/tests/integration/datalake/conftest.py @@ -14,16 +14,16 @@ import os from copy import deepcopy -import boto3 import pytest -from moto import mock_aws from metadata.generated.schema.entity.services.databaseService import DatabaseService from metadata.workflow.data_quality import TestSuiteWorkflow from metadata.workflow.metadata import MetadataWorkflow from metadata.workflow.profiler import ProfilerWorkflow -BUCKET_NAME = "MyBucket" +from ..containers import MinioContainerConfigs, get_minio_container + +BUCKET_NAME = "my-bucket" INGESTION_CONFIG = { "source": { @@ -77,7 +77,7 @@ "sourceConfig": { "config": { "type": "TestSuite", - "entityFullyQualifiedName": 'datalake_for_integration_tests.default.MyBucket."users.csv"', + "entityFullyQualifiedName": f'datalake_for_integration_tests.default.{BUCKET_NAME}."users.csv"', } }, }, @@ -128,31 +128,19 @@ } -@pytest.fixture(scope="module", autouse=True) -def aws(): - with mock_aws(): - yield boto3.client("s3", region_name="us-east-1") +@pytest.fixture(scope="session") +def minio_container(): + with get_minio_container(MinioContainerConfigs()) as container: + yield container @pytest.fixture(scope="class", autouse=True) -def setup_s3(request) -> None: +def setup_s3(minio_container) -> None: # Mock our S3 bucket and ingest a file - boto3.DEFAULT_SESSION = None - request.cls.s3_client = boto3.client( - "s3", - region_name="us-west-1", - ) - s3 = boto3.resource( - "s3", - region_name="us-west-1", - aws_access_key_id="fake_access_key", - aws_secret_access_key="fake_secret_key", - ) - request.cls.s3_client.create_bucket( - Bucket=BUCKET_NAME, - CreateBucketConfiguration={"LocationConstraint": "us-west-1"}, - ) - s3.meta.client.head_bucket(Bucket=BUCKET_NAME) + client = minio_container.get_client() + if client.bucket_exists(BUCKET_NAME): + return + client.make_bucket(BUCKET_NAME) current_dir = os.path.dirname(__file__) resources_dir 
= os.path.join(current_dir, "resources") @@ -161,23 +149,31 @@ def setup_s3(request) -> None: for path, _, files in os.walk(resources_dir) for filename in files ] - - request.cls.s3_keys = [] - for path in resources_paths: key = os.path.relpath(path, resources_dir) - request.cls.s3_keys.append(key) - request.cls.s3_client.upload_file(Filename=path, Bucket=BUCKET_NAME, Key=key) - yield - bucket = s3.Bucket(BUCKET_NAME) - for key in bucket.objects.all(): - key.delete() - bucket.delete() + client.fput_object(BUCKET_NAME, key, path) + return + + +@pytest.fixture(scope="class") +def ingestion_config(minio_container): + ingestion_config = deepcopy(INGESTION_CONFIG) + ingestion_config["source"]["serviceConnection"]["config"]["configSource"].update( + { + "securityConfig": { + "awsAccessKeyId": minio_container.access_key, + "awsSecretAccessKey": minio_container.secret_key, + "awsRegion": "us-west-1", + "endPointURL": f"http://localhost:{minio_container.get_exposed_port(minio_container.port)}", + } + } + ) + return ingestion_config @pytest.fixture(scope="class") -def run_ingestion(metadata): - ingestion_workflow = MetadataWorkflow.create(INGESTION_CONFIG) +def run_ingestion(metadata, ingestion_config): + ingestion_workflow = MetadataWorkflow.create(ingestion_config) ingestion_workflow.execute() ingestion_workflow.raise_from_status() ingestion_workflow.stop() @@ -188,28 +184,31 @@ def run_ingestion(metadata): metadata.delete(DatabaseService, db_service.id, recursive=True, hard_delete=True) -@pytest.fixture -def run_test_suite_workflow(run_ingestion): - ingestion_workflow = TestSuiteWorkflow.create(DATA_QUALITY_CONFIG) +@pytest.fixture(scope="class") +def run_test_suite_workflow(run_ingestion, ingestion_config): + workflow_config = deepcopy(DATA_QUALITY_CONFIG) + workflow_config["source"]["serviceConnection"] = ingestion_config["source"][ + "serviceConnection" + ] + ingestion_workflow = TestSuiteWorkflow.create(workflow_config) ingestion_workflow.execute() ingestion_workflow.raise_from_status() ingestion_workflow.stop() -@pytest.fixture(scope="session") -def profiler_workflow_config(workflow_config): - config = deepcopy(INGESTION_CONFIG) - config["source"]["sourceConfig"]["config"].update( +@pytest.fixture(scope="class") +def profiler_workflow_config(ingestion_config, workflow_config): + ingestion_config["source"]["sourceConfig"]["config"].update( { "type": "Profiler", } ) - config["processor"] = { + ingestion_config["processor"] = { "type": "orm-profiler", "config": {}, } - config["workflowConfig"] = workflow_config - return config + ingestion_config["workflowConfig"] = workflow_config + return ingestion_config @pytest.fixture() diff --git a/ingestion/tests/integration/orm_profiler/resources/profiler_test_.csv b/ingestion/tests/integration/datalake/resources/profiler_test_.csv similarity index 100% rename from ingestion/tests/integration/orm_profiler/resources/profiler_test_.csv rename to ingestion/tests/integration/datalake/resources/profiler_test_.csv diff --git a/ingestion/tests/integration/datalake/test_datalake_profiler_e2e.py b/ingestion/tests/integration/datalake/test_datalake_profiler_e2e.py new file mode 100644 index 000000000000..1a8244348a43 --- /dev/null +++ b/ingestion/tests/integration/datalake/test_datalake_profiler_e2e.py @@ -0,0 +1,311 @@ +# Copyright 2021 Collate +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Test Datalake Profiler workflow + +To run this we need OpenMetadata server up and running. + +No sample data is required beforehand +""" +import pytest + +from ingestion.tests.integration.datalake.conftest import BUCKET_NAME +from metadata.generated.schema.entity.data.table import ColumnProfile, Table +from metadata.utils.time_utils import ( + get_beginning_of_day_timestamp_mill, + get_end_of_day_timestamp_mill, +) +from metadata.workflow.profiler import ProfilerWorkflow +from metadata.workflow.workflow_output_handler import WorkflowResultStatus + + +@pytest.fixture(scope="class", autouse=True) +def before_each(run_ingestion): + pass + + +class TestDatalakeProfilerTestE2E: + """datalake profiler E2E test""" + + def test_datalake_profiler_workflow(self, ingestion_config, metadata): + ingestion_config["source"]["sourceConfig"]["config"].update( + { + "type": "Profiler", + } + ) + ingestion_config["processor"] = { + "type": "orm-profiler", + "config": {}, + } + + profiler_workflow = ProfilerWorkflow.create(ingestion_config) + profiler_workflow.execute() + status = profiler_workflow.result_status() + profiler_workflow.stop() + + assert status == WorkflowResultStatus.SUCCESS + + table_profile = metadata.get_profile_data( + f'{ingestion_config["source"]["serviceName"]}.default.{BUCKET_NAME}."profiler_test_.csv"', + get_beginning_of_day_timestamp_mill(), + get_end_of_day_timestamp_mill(), + ) + + column_profile = metadata.get_profile_data( + f'{ingestion_config["source"]["serviceName"]}.default.{BUCKET_NAME}."profiler_test_.csv".first_name', + get_beginning_of_day_timestamp_mill(), + get_end_of_day_timestamp_mill(), + profile_type=ColumnProfile, + ) + + assert table_profile.entities + assert column_profile.entities + + def test_values_partitioned_datalake_profiler_workflow( + self, metadata, ingestion_config + ): + """Test partitioned datalake profiler workflow""" + ingestion_config["source"]["sourceConfig"]["config"].update( + { + "type": "Profiler", + } + ) + ingestion_config["processor"] = { + "type": "orm-profiler", + "config": { + "tableConfig": [ + { + "fullyQualifiedName": f'{ingestion_config["source"]["serviceName"]}.default.{BUCKET_NAME}."profiler_test_.csv"', + "partitionConfig": { + "enablePartitioning": "true", + "partitionColumnName": "first_name", + "partitionIntervalType": "COLUMN-VALUE", + "partitionValues": ["John"], + }, + } + ] + }, + } + + profiler_workflow = ProfilerWorkflow.create(ingestion_config) + profiler_workflow.execute() + status = profiler_workflow.result_status() + profiler_workflow.stop() + + assert status == WorkflowResultStatus.SUCCESS + + table = metadata.get_by_name( + entity=Table, + fqn=f'{ingestion_config["source"]["serviceName"]}.default.{BUCKET_NAME}."profiler_test_.csv"', + fields=["tableProfilerConfig"], + nullable=False, + ) + + profile = metadata.get_latest_table_profile(table.fullyQualifiedName).profile + + assert profile.rowCount == 1.0 + + def test_datetime_partitioned_datalake_profiler_workflow( + self, ingestion_config, metadata + ): + """Test partitioned datalake profiler workflow""" + 
ingestion_config["source"]["sourceConfig"]["config"].update( + { + "type": "Profiler", + } + ) + ingestion_config["processor"] = { + "type": "orm-profiler", + "config": { + "tableConfig": [ + { + "fullyQualifiedName": f'{ingestion_config["source"]["serviceName"]}.default.{BUCKET_NAME}."profiler_test_.csv"', + "partitionConfig": { + "enablePartitioning": "true", + "partitionColumnName": "birthdate", + "partitionIntervalType": "TIME-UNIT", + "partitionIntervalUnit": "YEAR", + "partitionInterval": 35, + }, + } + ], + }, + } + + profiler_workflow = ProfilerWorkflow.create(ingestion_config) + profiler_workflow.execute() + status = profiler_workflow.result_status() + profiler_workflow.stop() + + assert status == WorkflowResultStatus.SUCCESS + + table = metadata.get_by_name( + entity=Table, + fqn=f'{ingestion_config["source"]["serviceName"]}.default.{BUCKET_NAME}."profiler_test_.csv"', + fields=["tableProfilerConfig"], + ) + + profile = metadata.get_latest_table_profile(table.fullyQualifiedName).profile + + assert profile.rowCount == 2.0 + + def test_integer_range_partitioned_datalake_profiler_workflow( + self, ingestion_config, metadata + ): + """Test partitioned datalake profiler workflow""" + ingestion_config["source"]["sourceConfig"]["config"].update( + { + "type": "Profiler", + } + ) + ingestion_config["processor"] = { + "type": "orm-profiler", + "config": { + "tableConfig": [ + { + "fullyQualifiedName": f'{ingestion_config["source"]["serviceName"]}.default.{BUCKET_NAME}."profiler_test_.csv"', + "profileSample": 100, + "partitionConfig": { + "enablePartitioning": "true", + "partitionColumnName": "age", + "partitionIntervalType": "INTEGER-RANGE", + "partitionIntegerRangeStart": 35, + "partitionIntegerRangeEnd": 44, + }, + } + ], + }, + } + + profiler_workflow = ProfilerWorkflow.create(ingestion_config) + profiler_workflow.execute() + status = profiler_workflow.result_status() + profiler_workflow.stop() + + assert status == WorkflowResultStatus.SUCCESS + + table = metadata.get_by_name( + entity=Table, + fqn=f'{ingestion_config["source"]["serviceName"]}.default.{BUCKET_NAME}."profiler_test_.csv"', + fields=["tableProfilerConfig"], + ) + + profile = metadata.get_latest_table_profile(table.fullyQualifiedName).profile + + assert profile.rowCount == 2.0 + + def test_datalake_profiler_workflow_with_custom_profiler_config( + self, metadata, ingestion_config + ): + """Test custom profiler config return expected sample and metric computation""" + profiler_metrics = [ + "MIN", + "MAX", + "MEAN", + "MEDIAN", + ] + id_metrics = ["MIN", "MAX"] + non_metric_values = ["name", "timestamp"] + + ingestion_config["source"]["sourceConfig"]["config"].update( + { + "type": "Profiler", + } + ) + ingestion_config["processor"] = { + "type": "orm-profiler", + "config": { + "profiler": { + "name": "ingestion_profiler", + "metrics": profiler_metrics, + }, + "tableConfig": [ + { + "fullyQualifiedName": f'{ingestion_config["source"]["serviceName"]}.default.{BUCKET_NAME}."profiler_test_.csv"', + "columnConfig": { + "includeColumns": [ + {"columnName": "id", "metrics": id_metrics}, + {"columnName": "age"}, + ] + }, + } + ], + }, + } + + profiler_workflow = ProfilerWorkflow.create(ingestion_config) + profiler_workflow.execute() + status = profiler_workflow.result_status() + profiler_workflow.stop() + + assert status == WorkflowResultStatus.SUCCESS + + table = metadata.get_by_name( + entity=Table, + fqn=f'{ingestion_config["source"]["serviceName"]}.default.{BUCKET_NAME}."profiler_test_.csv"', + fields=["tableProfilerConfig"], 
+ ) + + id_profile = metadata.get_profile_data( + f'{ingestion_config["source"]["serviceName"]}.default.{BUCKET_NAME}."profiler_test_.csv".id', + get_beginning_of_day_timestamp_mill(), + get_end_of_day_timestamp_mill(), + profile_type=ColumnProfile, + ).entities + + latest_id_profile = max(id_profile, key=lambda o: o.timestamp.root) + + id_metric_ln = 0 + for metric_name, metric in latest_id_profile: + if metric_name.upper() in id_metrics: + assert metric is not None + id_metric_ln += 1 + else: + assert metric is None if metric_name not in non_metric_values else True + + assert id_metric_ln == len(id_metrics) + + age_profile = metadata.get_profile_data( + f'{ingestion_config["source"]["serviceName"]}.default.{BUCKET_NAME}."profiler_test_.csv".age', + get_beginning_of_day_timestamp_mill(), + get_end_of_day_timestamp_mill(), + profile_type=ColumnProfile, + ).entities + + latest_age_profile = max(age_profile, key=lambda o: o.timestamp.root) + + age_metric_ln = 0 + for metric_name, metric in latest_age_profile: + if metric_name.upper() in profiler_metrics: + assert metric is not None + age_metric_ln += 1 + else: + assert metric is None if metric_name not in non_metric_values else True + + assert age_metric_ln == len(profiler_metrics) + + latest_exc_timestamp = latest_age_profile.timestamp.root + first_name_profile = metadata.get_profile_data( + f'{ingestion_config["source"]["serviceName"]}.default.{BUCKET_NAME}."profiler_test_.csv".first_name_profile', + get_beginning_of_day_timestamp_mill(), + get_end_of_day_timestamp_mill(), + profile_type=ColumnProfile, + ).entities + + assert not [ + p for p in first_name_profile if p.timestamp.root == latest_exc_timestamp + ] + + sample_data = metadata.get_sample_data(table) + assert sorted([c.root for c in sample_data.sampleData.columns]) == sorted( + ["id", "age"] + ) diff --git a/ingestion/tests/integration/datalake/test_ingestion.py b/ingestion/tests/integration/datalake/test_ingestion.py index 58c1847fee07..1bfff44f1c7c 100644 --- a/ingestion/tests/integration/datalake/test_ingestion.py +++ b/ingestion/tests/integration/datalake/test_ingestion.py @@ -13,6 +13,7 @@ import pytest +from ingestion.tests.integration.datalake.conftest import BUCKET_NAME from metadata.generated.schema.entity.data.table import DataType, Table from metadata.ingestion.ometa.models import EntityList from metadata.ingestion.ometa.ometa_api import OpenMetadata @@ -37,11 +38,15 @@ def test_ingestion(self, run_ingestion): ) # type: ignore entities = resp.entities - assert len(entities) == 4 + assert len(entities) == 5 names = [entity.name.root for entity in entities] - assert {"names.json", "names.jsonl", "new_users.parquet", "users.csv"} == set( - names - ) + assert { + "names.json", + "names.jsonl", + "new_users.parquet", + "users.csv", + "profiler_test_.csv", + } == set(names) for entity in entities: columns = entity.columns @@ -53,7 +58,7 @@ def test_profiler(self, run_profiler): """Also excluding the test for parquet files until the above is fixed""" csv_ = self.metadata.get_by_name( entity=Table, - fqn='datalake_for_integration_tests.default.MyBucket."users.csv"', + fqn=f'datalake_for_integration_tests.default.{BUCKET_NAME}."users.csv"', fields=["tableProfilerConfig"], ) # parquet_ = self.metadata.get_by_name( @@ -63,13 +68,13 @@ def test_profiler(self, run_profiler): # ) json_ = self.metadata.get_by_name( entity=Table, - fqn='datalake_for_integration_tests.default.MyBucket."names.json"', + fqn=f'datalake_for_integration_tests.default.{BUCKET_NAME}."names.json"', 
fields=["tableProfilerConfig"], ) jsonl_ = self.metadata.get_by_name( entity=Table, - fqn='datalake_for_integration_tests.default.MyBucket."names.jsonl"', + fqn=f'datalake_for_integration_tests.default.{BUCKET_NAME}."names.jsonl"', fields=["tableProfilerConfig"], ) diff --git a/ingestion/tests/integration/orm_profiler/test_datalake_profiler_e2e.py b/ingestion/tests/integration/orm_profiler/test_datalake_profiler_e2e.py deleted file mode 100644 index 7106be330ee0..000000000000 --- a/ingestion/tests/integration/orm_profiler/test_datalake_profiler_e2e.py +++ /dev/null @@ -1,440 +0,0 @@ -# Copyright 2021 Collate -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Test Datalake Profiler workflow - -To run this we need OpenMetadata server up and running. - -No sample data is required beforehand -""" - -import os -from copy import deepcopy -from pathlib import Path -from unittest import TestCase - -import boto3 -import botocore -from moto import mock_aws - -from metadata.generated.schema.entity.data.table import ColumnProfile, Table -from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import ( - OpenMetadataConnection, -) -from metadata.generated.schema.entity.services.databaseService import DatabaseService -from metadata.generated.schema.security.client.openMetadataJWTClientConfig import ( - OpenMetadataJWTClientConfig, -) -from metadata.ingestion.ometa.ometa_api import OpenMetadata -from metadata.utils.time_utils import ( - get_beginning_of_day_timestamp_mill, - get_end_of_day_timestamp_mill, -) -from metadata.workflow.metadata import MetadataWorkflow -from metadata.workflow.profiler import ProfilerWorkflow -from metadata.workflow.workflow_output_handler import WorkflowResultStatus - -SERVICE_NAME = Path(__file__).stem -REGION = "us-west-1" -BUCKET_NAME = "MyBucket" -INGESTION_CONFIG = { - "source": { - "type": "datalake", - "serviceName": SERVICE_NAME, - "serviceConnection": { - "config": { - "type": "Datalake", - "configSource": { - "securityConfig": { - "awsAccessKeyId": "fake_access_key", - "awsSecretAccessKey": "fake_secret_key", - "awsRegion": REGION, - } - }, - "bucketName": f"{BUCKET_NAME}", - } - }, - "sourceConfig": {"config": {"type": "DatabaseMetadata"}}, - }, - "sink": {"type": "metadata-rest", "config": {}}, - "workflowConfig": { - "openMetadataServerConfig": { - "hostPort": "http://localhost:8585/api", - "authProvider": "openmetadata", - "securityConfig": { - "jwtToken": "eyJraWQiOiJHYjM4OWEtOWY3Ni1nZGpzLWE5MmotMDI0MmJrOTQzNTYiLCJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsImlzQm90IjpmYWxzZSwiaXNzIjoib3Blbi1tZXRhZGF0YS5vcmciLCJpYXQiOjE2NjM5Mzg0NjIsImVtYWlsIjoiYWRtaW5Ab3Blbm1ldGFkYXRhLm9yZyJ9.tS8um_5DKu7HgzGBzS1VTA5uUjKWOCU0B_j08WXBiEC0mr0zNREkqVfwFDD-d24HlNEbrqioLsBuFRiwIWKc1m_ZlVQbG7P36RUxhuv2vbSp80FKyNM-Tj93FDzq91jsyNmsQhyNv_fNr3TXfzzSPjHt8Go0FMMP66weoKMgW2PbXlhVKwEuXUHyakLLzewm9UMeQaEiRzhiTMU3UkLXcKbYEJJvfNFcLwSl9W8JCO_l0Yj3ud-qt_nQYEZwqW6u5nfdQllN133iikV4fM5QZsMCnm8Rq1mvLR0y9bmJiD7fwM1tmJ791TUWqmKaTnP49U493VanKpUAfzIiOiIbhg" - }, - } - 
}, -} - - -@mock_aws -class DatalakeProfilerTestE2E(TestCase): - """datalake profiler E2E test""" - - @classmethod - def setUpClass(cls) -> None: - server_config = OpenMetadataConnection( - hostPort="http://localhost:8585/api", - authProvider="openmetadata", - securityConfig=OpenMetadataJWTClientConfig( - jwtToken="eyJraWQiOiJHYjM4OWEtOWY3Ni1nZGpzLWE5MmotMDI0MmJrOTQzNTYiLCJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsImlzQm90IjpmYWxzZSwiaXNzIjoib3Blbi1tZXRhZGF0YS5vcmciLCJpYXQiOjE2NjM5Mzg0NjIsImVtYWlsIjoiYWRtaW5Ab3Blbm1ldGFkYXRhLm9yZyJ9.tS8um_5DKu7HgzGBzS1VTA5uUjKWOCU0B_j08WXBiEC0mr0zNREkqVfwFDD-d24HlNEbrqioLsBuFRiwIWKc1m_ZlVQbG7P36RUxhuv2vbSp80FKyNM-Tj93FDzq91jsyNmsQhyNv_fNr3TXfzzSPjHt8Go0FMMP66weoKMgW2PbXlhVKwEuXUHyakLLzewm9UMeQaEiRzhiTMU3UkLXcKbYEJJvfNFcLwSl9W8JCO_l0Yj3ud-qt_nQYEZwqW6u5nfdQllN133iikV4fM5QZsMCnm8Rq1mvLR0y9bmJiD7fwM1tmJ791TUWqmKaTnP49U493VanKpUAfzIiOiIbhg" - ), - ) # type: ignore - cls.metadata = OpenMetadata(server_config) - - def setUp(self) -> None: - # Mock our S3 bucket and ingest a file - boto3.DEFAULT_SESSION = None - self.client = boto3.client( - "s3", - region_name=REGION, - ) - - # check that we are not running our test against a real bucket - try: - s3 = boto3.resource( - "s3", - region_name=REGION, - aws_access_key_id="fake_access_key", - aws_secret_access_key="fake_secret_key", - ) - s3.meta.client.head_bucket(Bucket=BUCKET_NAME) - except botocore.exceptions.ClientError: - pass - else: - err = f"{BUCKET_NAME} should not exist." - raise EnvironmentError(err) - self.client.create_bucket( - Bucket=BUCKET_NAME, - CreateBucketConfiguration={"LocationConstraint": REGION}, - ) - current_dir = os.path.dirname(__file__) - resources_dir = os.path.join(current_dir, "resources") - - resources_paths = [ - os.path.join(path, filename) - for path, _, files in os.walk(resources_dir) - for filename in files - ] - - self.s3_keys = [] - - for path in resources_paths: - key = os.path.relpath(path, resources_dir) - self.s3_keys.append(key) - self.client.upload_file(Filename=path, Bucket=BUCKET_NAME, Key=key) - - # Ingest our S3 data - ingestion_workflow = MetadataWorkflow.create(INGESTION_CONFIG) - ingestion_workflow.execute() - ingestion_workflow.raise_from_status() - ingestion_workflow.print_status() - ingestion_workflow.stop() - - def test_datalake_profiler_workflow(self): - workflow_config = deepcopy(INGESTION_CONFIG) - workflow_config["source"]["sourceConfig"]["config"].update( - { - "type": "Profiler", - } - ) - workflow_config["processor"] = { - "type": "orm-profiler", - "config": {}, - } - - profiler_workflow = ProfilerWorkflow.create(workflow_config) - profiler_workflow.execute() - status = profiler_workflow.result_status() - profiler_workflow.stop() - - assert status == WorkflowResultStatus.SUCCESS - - table_profile = self.metadata.get_profile_data( - f'{SERVICE_NAME}.default.MyBucket."profiler_test_.csv"', - get_beginning_of_day_timestamp_mill(), - get_end_of_day_timestamp_mill(), - ) - - column_profile = self.metadata.get_profile_data( - f'{SERVICE_NAME}.default.MyBucket."profiler_test_.csv".first_name', - get_beginning_of_day_timestamp_mill(), - get_end_of_day_timestamp_mill(), - profile_type=ColumnProfile, - ) - - assert table_profile.entities - assert column_profile.entities - - def test_values_partitioned_datalake_profiler_workflow(self): - """Test partitioned datalake profiler workflow""" - workflow_config = deepcopy(INGESTION_CONFIG) - workflow_config["source"]["sourceConfig"]["config"].update( - { - "type": "Profiler", - } - ) - 
workflow_config["processor"] = { - "type": "orm-profiler", - "config": { - "tableConfig": [ - { - "fullyQualifiedName": f'{SERVICE_NAME}.default.MyBucket."profiler_test_.csv"', - "partitionConfig": { - "enablePartitioning": "true", - "partitionColumnName": "first_name", - "partitionIntervalType": "COLUMN-VALUE", - "partitionValues": ["John"], - }, - } - ] - }, - } - - profiler_workflow = ProfilerWorkflow.create(workflow_config) - profiler_workflow.execute() - status = profiler_workflow.result_status() - profiler_workflow.stop() - - assert status == WorkflowResultStatus.SUCCESS - - table = self.metadata.get_by_name( - entity=Table, - fqn=f'{SERVICE_NAME}.default.MyBucket."profiler_test_.csv"', - fields=["tableProfilerConfig"], - ) - - profile = self.metadata.get_latest_table_profile( - table.fullyQualifiedName - ).profile - - assert profile.rowCount == 1.0 - - def test_datetime_partitioned_datalake_profiler_workflow(self): - """Test partitioned datalake profiler workflow""" - workflow_config = deepcopy(INGESTION_CONFIG) - workflow_config["source"]["sourceConfig"]["config"].update( - { - "type": "Profiler", - } - ) - workflow_config["processor"] = { - "type": "orm-profiler", - "config": { - "tableConfig": [ - { - "fullyQualifiedName": f'{SERVICE_NAME}.default.MyBucket."profiler_test_.csv"', - "partitionConfig": { - "enablePartitioning": "true", - "partitionColumnName": "birthdate", - "partitionIntervalType": "TIME-UNIT", - "partitionIntervalUnit": "YEAR", - "partitionInterval": 35, - }, - } - ], - }, - } - - profiler_workflow = ProfilerWorkflow.create(workflow_config) - profiler_workflow.execute() - status = profiler_workflow.result_status() - profiler_workflow.stop() - - assert status == WorkflowResultStatus.SUCCESS - - table = self.metadata.get_by_name( - entity=Table, - fqn=f'{SERVICE_NAME}.default.MyBucket."profiler_test_.csv"', - fields=["tableProfilerConfig"], - ) - - profile = self.metadata.get_latest_table_profile( - table.fullyQualifiedName - ).profile - - assert profile.rowCount == 2.0 - - def test_integer_range_partitioned_datalake_profiler_workflow(self): - """Test partitioned datalake profiler workflow""" - workflow_config = deepcopy(INGESTION_CONFIG) - workflow_config["source"]["sourceConfig"]["config"].update( - { - "type": "Profiler", - } - ) - workflow_config["processor"] = { - "type": "orm-profiler", - "config": { - "tableConfig": [ - { - "fullyQualifiedName": f'{SERVICE_NAME}.default.MyBucket."profiler_test_.csv"', - "profileSample": 100, - "partitionConfig": { - "enablePartitioning": "true", - "partitionColumnName": "age", - "partitionIntervalType": "INTEGER-RANGE", - "partitionIntegerRangeStart": 35, - "partitionIntegerRangeEnd": 44, - }, - } - ], - }, - } - - profiler_workflow = ProfilerWorkflow.create(workflow_config) - profiler_workflow.execute() - status = profiler_workflow.result_status() - profiler_workflow.stop() - - assert status == WorkflowResultStatus.SUCCESS - - table = self.metadata.get_by_name( - entity=Table, - fqn=f'{SERVICE_NAME}.default.MyBucket."profiler_test_.csv"', - fields=["tableProfilerConfig"], - ) - - profile = self.metadata.get_latest_table_profile( - table.fullyQualifiedName - ).profile - - assert profile.rowCount == 2.0 - - def test_datalake_profiler_workflow_with_custom_profiler_config(self): - """Test custom profiler config return expected sample and metric computation""" - profiler_metrics = [ - "MIN", - "MAX", - "MEAN", - "MEDIAN", - ] - id_metrics = ["MIN", "MAX"] - non_metric_values = ["name", "timestamp"] - - workflow_config = 
deepcopy(INGESTION_CONFIG) - workflow_config["source"]["sourceConfig"]["config"].update( - { - "type": "Profiler", - } - ) - workflow_config["processor"] = { - "type": "orm-profiler", - "config": { - "profiler": { - "name": "ingestion_profiler", - "metrics": profiler_metrics, - }, - "tableConfig": [ - { - "fullyQualifiedName": f'{SERVICE_NAME}.default.MyBucket."profiler_test_.csv"', - "columnConfig": { - "includeColumns": [ - {"columnName": "id", "metrics": id_metrics}, - {"columnName": "age"}, - ] - }, - } - ], - }, - } - - profiler_workflow = ProfilerWorkflow.create(workflow_config) - profiler_workflow.execute() - status = profiler_workflow.result_status() - profiler_workflow.stop() - - assert status == WorkflowResultStatus.SUCCESS - - table = self.metadata.get_by_name( - entity=Table, - fqn=f'{SERVICE_NAME}.default.MyBucket."profiler_test_.csv"', - fields=["tableProfilerConfig"], - ) - - id_profile = self.metadata.get_profile_data( - f'{SERVICE_NAME}.default.MyBucket."profiler_test_.csv".id', - get_beginning_of_day_timestamp_mill(), - get_end_of_day_timestamp_mill(), - profile_type=ColumnProfile, - ).entities - - latest_id_profile = max(id_profile, key=lambda o: o.timestamp.root) - - id_metric_ln = 0 - for metric_name, metric in latest_id_profile: - if metric_name.upper() in id_metrics: - assert metric is not None - id_metric_ln += 1 - else: - assert metric is None if metric_name not in non_metric_values else True - - assert id_metric_ln == len(id_metrics) - - age_profile = self.metadata.get_profile_data( - f'{SERVICE_NAME}.default.MyBucket."profiler_test_.csv".age', - get_beginning_of_day_timestamp_mill(), - get_end_of_day_timestamp_mill(), - profile_type=ColumnProfile, - ).entities - - latest_age_profile = max(age_profile, key=lambda o: o.timestamp.root) - - age_metric_ln = 0 - for metric_name, metric in latest_age_profile: - if metric_name.upper() in profiler_metrics: - assert metric is not None - age_metric_ln += 1 - else: - assert metric is None if metric_name not in non_metric_values else True - - assert age_metric_ln == len(profiler_metrics) - - latest_exc_timestamp = latest_age_profile.timestamp.root - first_name_profile = self.metadata.get_profile_data( - f'{SERVICE_NAME}.default.MyBucket."profiler_test_.csv".first_name_profile', - get_beginning_of_day_timestamp_mill(), - get_end_of_day_timestamp_mill(), - profile_type=ColumnProfile, - ).entities - - assert not [ - p for p in first_name_profile if p.timestamp.root == latest_exc_timestamp - ] - - sample_data = self.metadata.get_sample_data(table) - assert sorted([c.root for c in sample_data.sampleData.columns]) == sorted( - ["id", "age"] - ) - - def tearDown(self): - s3 = boto3.resource( - "s3", - region_name=REGION, - ) - bucket = s3.Bucket(BUCKET_NAME) - for key in bucket.objects.all(): - key.delete() - bucket.delete() - - service_id = str( - self.metadata.get_by_name(entity=DatabaseService, fqn=SERVICE_NAME).id.root - ) - - self.metadata.delete( - entity=DatabaseService, - entity_id=service_id, - recursive=True, - hard_delete=True, - ) diff --git a/ingestion/tests/integration/sources/database/delta_lake/conftest.py b/ingestion/tests/integration/sources/database/delta_lake/conftest.py index 3fc5ac318910..a11a33076c92 100644 --- a/ingestion/tests/integration/sources/database/delta_lake/conftest.py +++ b/ingestion/tests/integration/sources/database/delta_lake/conftest.py @@ -36,7 +36,7 @@ def with_exposed_port(self, minio): ] = f"http://localhost:{self.minio_config.exposed_port}" -@pytest.fixture(scope="session") 
+@pytest.fixture(scope="module") def deltalake_storage_environment(): config = DeltaLakeStorageTestConfig() minio = get_minio_container(config.minio_config) diff --git a/ingestion/tests/unit/profiler/pandas/test_custom_metrics.py b/ingestion/tests/unit/profiler/pandas/test_custom_metrics.py index 0270750c702f..cac724d17e1a 100644 --- a/ingestion/tests/unit/profiler/pandas/test_custom_metrics.py +++ b/ingestion/tests/unit/profiler/pandas/test_custom_metrics.py @@ -18,10 +18,7 @@ from unittest.mock import patch from uuid import uuid4 -import boto3 -import botocore import pandas as pd -from moto import mock_aws from metadata.generated.schema.entity.data.table import Column as EntityColumn from metadata.generated.schema.entity.data.table import ColumnName, DataType, Table @@ -43,7 +40,6 @@ REGION = "us-west-1" -@mock_aws class MetricsTest(TestCase): """ Run checks on different metrics @@ -103,45 +99,6 @@ class MetricsTest(TestCase): ) def setUp(self): - # Mock our S3 bucket and ingest a file - boto3.DEFAULT_SESSION = None - self.client = boto3.client( - "s3", - region_name=REGION, - ) - - # check that we are not running our test against a real bucket - try: - s3 = boto3.resource( - "s3", - region_name=REGION, - aws_access_key_id="fake_access_key", - aws_secret_access_key="fake_secret_key", - ) - s3.meta.client.head_bucket(Bucket=BUCKET_NAME) - except botocore.exceptions.ClientError: - pass - else: - err = f"{BUCKET_NAME} should not exist." - raise EnvironmentError(err) - self.client.create_bucket( - Bucket=BUCKET_NAME, - CreateBucketConfiguration={"LocationConstraint": REGION}, - ) - - resources_paths = [ - os.path.join(path, filename) - for path, _, files in os.walk(self.resources_dir) - for filename in files - ] - - self.s3_keys = [] - - for path in resources_paths: - key = os.path.relpath(path, self.resources_dir) - self.s3_keys.append(key) - self.client.upload_file(Filename=path, Bucket=BUCKET_NAME, Key=key) - with patch( "metadata.mixins.pandas.pandas_mixin.fetch_dataframe", return_value=self.dfs,