From 0d66c0e51435895c1c95d1a793e48cb35dead09c Mon Sep 17 00:00:00 2001 From: Ayush Shah Date: Wed, 31 Jan 2024 20:43:04 +0530 Subject: [PATCH] Minor: Add extended glossary (#14918) * Add Nested glossary to extended sample data * Fix lint * Add Stress test config * Fix Lint --- ingestion/pipelines/extended_sample_data.yaml | 2 + .../source/database/extended_sample_data.py | 336 +++++++++++------- 2 files changed, 204 insertions(+), 134 deletions(-) diff --git a/ingestion/pipelines/extended_sample_data.yaml b/ingestion/pipelines/extended_sample_data.yaml index bed3bb3b7eee..47d5c5bdd7a6 100644 --- a/ingestion/pipelines/extended_sample_data.yaml +++ b/ingestion/pipelines/extended_sample_data.yaml @@ -9,6 +9,8 @@ source: connectionOptions: sampleDataFolder: "./examples/sample_data" extendedSampleDataFolder: "./examples/extended_sample_data" + includeGlossary: True + includeLineageStressTesting: True sourceConfig: {} sink: type: metadata-rest diff --git a/ingestion/src/metadata/ingestion/source/database/extended_sample_data.py b/ingestion/src/metadata/ingestion/source/database/extended_sample_data.py index b2892a184e5a..5e84f04e0d10 100644 --- a/ingestion/src/metadata/ingestion/source/database/extended_sample_data.py +++ b/ingestion/src/metadata/ingestion/source/database/extended_sample_data.py @@ -24,6 +24,10 @@ from metadata.generated.schema.api.data.createDatabaseSchema import ( CreateDatabaseSchemaRequest, ) +from metadata.generated.schema.api.data.createGlossary import CreateGlossaryRequest +from metadata.generated.schema.api.data.createGlossaryTerm import ( + CreateGlossaryTermRequest, +) from metadata.generated.schema.api.data.createTable import CreateTableRequest from metadata.generated.schema.api.lineage.addLineage import AddLineageRequest from metadata.generated.schema.entity.data.dashboard import Dashboard @@ -91,10 +95,20 @@ def __init__(self, config: WorkflowSource, metadata: OpenMetadata): self.store_table_fqn = set() self.store_data_model_fqn = [] self.store_dashboard_fqn = [] + self.main_glossary = None + self.glossary_term_list = [] sample_data_folder = self.service_connection.connectionOptions.__root__.get( "sampleDataFolder" ) + self.include_glossary = self.service_connection.connectionOptions.__root__.get( + "includeGlossary" + ) + self.include_lineage_stress_testing = ( + self.service_connection.connectionOptions.__root__.get( + "includeLineageStressTesting" + ) + ) extneded_sample_data_folder = ( self.service_connection.connectionOptions.__root__.get( "extendedSampleDataFolder" @@ -225,43 +239,71 @@ def create_depth_nodes(self, from_table, to_table): ) ) - def generate_sample_data(self): # pylint: disable=too-many-locals + def generate_sample_data( + self, + ): # pylint: disable=too-many-locals,too-many-statements """ Generate sample data for dashboard and database service, with lineage between them, having long names, special characters and description """ - - db = self.create_database_request("extended_sample_data", self.generate_text()) - yield Either(right=db) - - schema = self.create_database_schema_request( - "extended_sample_database_schema", self.generate_text(), db - ) - yield Either(right=schema) - for table in self.extended_tables["tables"]: - text = self.generate_text() - table_request = self.create_table_request( - table["name"], text, schema, table - ) - yield Either(right=table_request) - downstream_node_fqn_table = fqn.build( - self.metadata, - entity_type=Table, - service_name=self.database_service.name.__root__, - database_name=db.name.__root__, - schema_name=schema.name.__root__, - table_name=table_request.name.__root__, + if self.include_glossary: + yield from self.create_glossary() + yield from self.create_glossary_term() + if self.include_lineage_stress_testing: + db = self.create_database_request( + "extended_sample_data", self.generate_text() ) - to_table = self.metadata.get_by_name( - entity=Table, fqn=downstream_node_fqn_table - ) - main_table = to_table - self.store_table_fqn.add(downstream_node_fqn_table) + yield Either(right=db) + schema = self.create_database_schema_request( + "extended_sample_database_schema", self.generate_text(), db + ) + yield Either(right=schema) + for table in self.extended_tables["tables"]: + text = self.generate_text() + table_request = self.create_table_request( + table["name"], text, schema, table + ) + yield Either(right=table_request) + downstream_node_fqn_table = fqn.build( + self.metadata, + entity_type=Table, + service_name=self.database_service.name.__root__, + database_name=db.name.__root__, + schema_name=schema.name.__root__, + table_name=table_request.name.__root__, + ) + to_table = self.metadata.get_by_name( + entity=Table, fqn=downstream_node_fqn_table + ) + main_table = to_table + self.store_table_fqn.add(downstream_node_fqn_table) + + for _ in range(40): + # 40 Dynamic Lineage Depths + name = self.generate_name() + table_request = self.create_table_request(name, text, schema, table) + yield Either(right=table_request) + upstream_node_fqn_table = fqn.build( + self.metadata, + entity_type=Table, + service_name=self.database_service.name.__root__, + database_name=db.name.__root__, + schema_name=schema.name.__root__, + table_name=table_request.name.__root__, + ) + from_table = self.metadata.get_by_name( + entity=Table, fqn=upstream_node_fqn_table + ) + yield from self.create_depth_nodes( + from_table=from_table, to_table=to_table + ) + to_table = from_table for _ in range(40): - # 40 Dynamic Lineage Depths name = self.generate_name() - table_request = self.create_table_request(name, text, schema, table) + table_request = self.create_table_request( + name, text, schema, self.extended_tables["tables"][-1] + ) yield Either(right=table_request) upstream_node_fqn_table = fqn.build( self.metadata, @@ -275,127 +317,75 @@ def generate_sample_data(self): # pylint: disable=too-many-locals entity=Table, fqn=upstream_node_fqn_table ) yield from self.create_depth_nodes( - from_table=from_table, to_table=to_table + from_table=from_table, to_table=main_table ) - to_table = from_table - for _ in range(40): - name = self.generate_name() - table_request = self.create_table_request( - name, text, schema, self.extended_tables["tables"][-1] - ) - yield Either(right=table_request) - upstream_node_fqn_table = fqn.build( - self.metadata, - entity_type=Table, - service_name=self.database_service.name.__root__, - database_name=db.name.__root__, - schema_name=schema.name.__root__, - table_name=table_request.name.__root__, - ) - from_table = self.metadata.get_by_name( - entity=Table, fqn=upstream_node_fqn_table - ) - yield from self.create_depth_nodes( - from_table=from_table, to_table=main_table - ) - for table in self.tables["tables"]: - text = self.generate_text() - table_request = self.create_table_request( - table["name"], text, schema, table - ) - yield Either(right=table_request) - table_entity_fqn = fqn.build( - self.metadata, - entity_type=Table, - service_name=self.database_service.name.__root__, - database_name=db.name.__root__, - schema_name=schema.name.__root__, - table_name=table_request.name.__root__, - ) - from_table = self.metadata.get_by_name(entity=Table, fqn=table_entity_fqn) - - yield from self.create_depth_nodes(from_table=from_table, to_table=to_table) - - self.dashboard_service_json["name"] = name - self.dashboard_service_json["description"] = text - for chart in self.charts["charts"]: - yield Either( - right=CreateChartRequest( - name=chart["name"], - displayName=chart["displayName"], - description=chart["description"], - chartType=get_standard_chart_type(chart["chartType"]), - sourceUrl=chart["sourceUrl"], - service=self.dashboard_service.fullyQualifiedName, + for table in self.tables["tables"]: + text = self.generate_text() + table_request = self.create_table_request( + table["name"], text, schema, table + ) + yield Either(right=table_request) + table_entity_fqn = fqn.build( + self.metadata, + entity_type=Table, + service_name=self.database_service.name.__root__, + database_name=db.name.__root__, + schema_name=schema.name.__root__, + table_name=table_request.name.__root__, + ) + from_table = self.metadata.get_by_name( + entity=Table, fqn=table_entity_fqn ) - ) - for data_model in self.data_models["datamodels"]: - name = self.generate_name() - text = self.generate_text() - data_model_request = self.create_dashboard_data_model_request( - name, text, data_model - ) - yield Either(right=data_model_request) - data_model_entity_fqn = fqn.build( - self.metadata, - entity_type=DashboardDataModel, - service_name=self.dashboard_service.name.__root__, - data_model_name=data_model_request.name.__root__, - ) - self.store_data_model_fqn.append(data_model_entity_fqn) - for table_fqn in self.store_table_fqn: - from_table = self.metadata.get_by_name(entity=Table, fqn=table_fqn) - for dashboard_datamodel_fqn in self.store_data_model_fqn: - to_datamodel = self.metadata.get_by_name( - entity=DashboardDataModel, fqn=dashboard_datamodel_fqn + yield from self.create_depth_nodes( + from_table=from_table, to_table=to_table ) + self.dashboard_service_json["name"] = name + self.dashboard_service_json["description"] = text + for chart in self.charts["charts"]: yield Either( - right=AddLineageRequest( - edge=EntitiesEdge( - fromEntity=EntityReference( - id=from_table.id.__root__, type="table" - ), - toEntity=EntityReference( - id=to_datamodel.id.__root__, - type="dashboardDataModel", - ), - lineageDetails=LineageDetails( - source=LineageSource.DashboardLineage - ), - ) - ) - ) - for dashboard in self.dashboards["dashboards"]: - dashboard_request = CreateDashboardRequest( - name=dashboard["name"], - displayName=dashboard["displayName"], - description=dashboard["description"], - sourceUrl=dashboard["sourceUrl"], - charts=dashboard["charts"], + right=CreateChartRequest( + name=chart["name"], + displayName=chart["displayName"], + description=chart["description"], + chartType=get_standard_chart_type(chart["chartType"]), + sourceUrl=chart["sourceUrl"], service=self.dashboard_service.fullyQualifiedName, ) - yield Either(right=dashboard_request) - dashboard_fqn = fqn.build( - self.metadata, - entity_type=Dashboard, - service_name=self.dashboard_service.name.__root__, - dashboard_name=dashboard_request.name.__root__, - ) - to_dashboard = self.metadata.get_by_name( - entity=Dashboard, fqn=dashboard_fqn + ) + for data_model in self.data_models["datamodels"]: + name = self.generate_name() + text = self.generate_text() + data_model_request = self.create_dashboard_data_model_request( + name, text, data_model + ) + yield Either(right=data_model_request) + data_model_entity_fqn = fqn.build( + self.metadata, + entity_type=DashboardDataModel, + service_name=self.dashboard_service.name.__root__, + data_model_name=data_model_request.name.__root__, + ) + self.store_data_model_fqn.append(data_model_entity_fqn) + + for table_fqn in self.store_table_fqn: + from_table = self.metadata.get_by_name(entity=Table, fqn=table_fqn) + for dashboard_datamodel_fqn in self.store_data_model_fqn: + to_datamodel = self.metadata.get_by_name( + entity=DashboardDataModel, fqn=dashboard_datamodel_fqn ) + yield Either( right=AddLineageRequest( edge=EntitiesEdge( fromEntity=EntityReference( - id=to_datamodel.id.__root__, - type="dashboardDataModel", + id=from_table.id.__root__, type="table" ), toEntity=EntityReference( - id=to_dashboard.id.__root__, type="dashboard" + id=to_datamodel.id.__root__, + type="dashboardDataModel", ), lineageDetails=LineageDetails( source=LineageSource.DashboardLineage @@ -403,6 +393,84 @@ def generate_sample_data(self): # pylint: disable=too-many-locals ) ) ) + for dashboard in self.dashboards["dashboards"]: + dashboard_request = CreateDashboardRequest( + name=dashboard["name"], + displayName=dashboard["displayName"], + description=dashboard["description"], + sourceUrl=dashboard["sourceUrl"], + charts=dashboard["charts"], + service=self.dashboard_service.fullyQualifiedName, + ) + yield Either(right=dashboard_request) + dashboard_fqn = fqn.build( + self.metadata, + entity_type=Dashboard, + service_name=self.dashboard_service.name.__root__, + dashboard_name=dashboard_request.name.__root__, + ) + to_dashboard = self.metadata.get_by_name( + entity=Dashboard, fqn=dashboard_fqn + ) + yield Either( + right=AddLineageRequest( + edge=EntitiesEdge( + fromEntity=EntityReference( + id=to_datamodel.id.__root__, + type="dashboardDataModel", + ), + toEntity=EntityReference( + id=to_dashboard.id.__root__, type="dashboard" + ), + lineageDetails=LineageDetails( + source=LineageSource.DashboardLineage + ), + ) + ) + ) + + def create_glossary(self): + self.main_glossary = CreateGlossaryRequest( + name="NestedGlossaryTest", + displayName="NestedGlossaryTest", + description="Description of test glossary", + ) + yield Either(right=self.main_glossary) + + def create_glossary_term(self): + """ + Create Glossary Terms + """ + for _ in range(20): + random_name = self.fake.first_name() + yield Either( + right=CreateGlossaryTermRequest( + glossary="NestedGlossaryTest", + name=random_name, + displayName=random_name, + description="Test glossary term ", + parent=self.glossary_term_list[-1] + if len(self.glossary_term_list) > 3 + else None, + ) + ) + if len(self.glossary_term_list) > 3 and self.glossary_term_list[-1]: + self.glossary_term_list.append( + f"{self.glossary_term_list[-1]}.{random_name}" + ) + else: + self.glossary_term_list.append(f"NestedGlossaryTest.{random_name}") + + for _ in range(500): + random_name = self.fake.first_name() + yield Either( + right=CreateGlossaryTermRequest( + glossary="NestedGlossaryTest", + name=random_name, + displayName=random_name, + description="Test glossary term 1", + ) + ) def generate_name(self): return f"Sample-@!3_(%t3st@)%_^{self.fake.name()}"