diff --git a/ingestion/setup.py b/ingestion/setup.py index c8082e0e1654..560c62625419 100644 --- a/ingestion/setup.py +++ b/ingestion/setup.py @@ -258,6 +258,7 @@ VERSIONS["lkml"], "gitpython~=3.1.34", VERSIONS["giturlparse"], + "python-liquid", }, "mlflow": {"mlflow-skinny>=2.3.0"}, "mongo": {VERSIONS["mongo"], VERSIONS["pandas"], VERSIONS["numpy"]}, diff --git a/ingestion/src/metadata/ingestion/source/dashboard/looker/metadata.py b/ingestion/src/metadata/ingestion/source/dashboard/looker/metadata.py index 89bb367a53f0..4cf1858227f9 100644 --- a/ingestion/src/metadata/ingestion/source/dashboard/looker/metadata.py +++ b/ingestion/src/metadata/ingestion/source/dashboard/looker/metadata.py @@ -28,6 +28,7 @@ import giturlparse import lkml +from liquid import Template from looker_sdk.sdk.api40.methods import Looker40SDK from looker_sdk.sdk.api40.models import Dashboard as LookerDashboard from looker_sdk.sdk.api40.models import ( @@ -450,11 +451,11 @@ def yield_bulk_datamodel( view.name, "Data model (View) filtered out." ) continue - + view_name = view.from_ if view.from_ else view.name yield from self._process_view( - view_name=ViewName(view.name), explore=model + view_name=ViewName(view_name), explore=model ) - if len(model.joins) == 0 and model.sql_table_name: + if model.view_name: yield from self._process_view( view_name=ViewName(model.view_name), explore=model ) @@ -570,7 +571,8 @@ def add_view_lineage( db_service_names = self.get_db_service_names() if view.sql_table_name: - source_table_name = self._clean_table_name(view.sql_table_name) + sql_table_name = self._render_table_name(view.sql_table_name) + source_table_name = self._clean_table_name(sql_table_name) # View to the source is only there if we are informing the dbServiceNames for db_service_name in db_service_names or []: @@ -726,6 +728,33 @@ def _clean_table_name(table_name: str) -> str: return table_name.lower().split(" as ")[0].strip() + @staticmethod + def _render_table_name(table_name: str) -> str: + """ + sql_table_names might contain Liquid templates + when defining an explore. e.g,: + sql_table_name: + {% if openmetadata %} + event + {% elsif event.created_week._in_query %} + event_by_week + {% else %} + event + {% endif %} ;; + we should render the template and give the option + to render a specific value during metadata ingestion + using the "openmetadata" context argument + :param table_name: table name with possible templating + :return: rendered table name + """ + try: + context = {"openmetadata": True} + template = Template(table_name) + sql_table_name = template.render(context) + except Exception: + sql_table_name = table_name + return sql_table_name + @staticmethod def get_dashboard_sources(dashboard_details: LookerDashboard) -> Set[str]: """ diff --git a/ingestion/src/metadata/profiler/metrics/composed/null_ratio.py b/ingestion/src/metadata/profiler/metrics/composed/null_ratio.py index cdc809eeb3f6..74c4c914902e 100644 --- a/ingestion/src/metadata/profiler/metrics/composed/null_ratio.py +++ b/ingestion/src/metadata/profiler/metrics/composed/null_ratio.py @@ -50,8 +50,9 @@ def fn(self, res: Dict[str, Any]) -> Optional[float]: results of other Metrics """ - count = res.get(Count.name()) - null_count = res.get(NullCount.name()) - if count + null_count == 0: + count = res.get(Count.name(), 0) + null_count = res.get(NullCount.name(), 0) + total = count + null_count + if total == 0: return None - return null_count / (null_count + count) + return null_count / total diff --git a/ingestion/src/metadata/profiler/orm/functions/table_metric_computer.py b/ingestion/src/metadata/profiler/orm/functions/table_metric_computer.py index 17d0d84b3968..5566600c61d6 100644 --- a/ingestion/src/metadata/profiler/orm/functions/table_metric_computer.py +++ b/ingestion/src/metadata/profiler/orm/functions/table_metric_computer.py @@ -176,6 +176,8 @@ def compute(self): ) rest = self._runner._session.execute(query).first() + if not rest: + return None if rest.rowCount is None: # if we don't have any row count, fallback to the base logic return super().compute() diff --git a/ingestion/src/metadata/profiler/orm/types/custom_hex_byte_string.py b/ingestion/src/metadata/profiler/orm/types/custom_hex_byte_string.py index e87afd234e88..58fcfb6d94ff 100644 --- a/ingestion/src/metadata/profiler/orm/types/custom_hex_byte_string.py +++ b/ingestion/src/metadata/profiler/orm/types/custom_hex_byte_string.py @@ -22,6 +22,7 @@ from metadata.utils.logger import ingestion_logger logger = ingestion_logger() +NULL_BYTE = "\x00" class HexByteString(TypeDecorator): @@ -63,10 +64,22 @@ def process_result_value(self, value: Optional[bytes], dialect) -> Optional[str] detected_encoding = chardet.detect(bytes_value).get("encoding") if detected_encoding: try: - value = bytes_value.decode(encoding=detected_encoding) - return value + # Decode the bytes value with the detected encoding and replace errors with "?" + # if bytes cannot be decoded e.g. b"\x66\x67\x67\x9c", if detected_encoding="utf-8" + # will result in 'foo�' (instead of failing) + str_value = bytes_value.decode( + encoding=detected_encoding, errors="replace" + ) + # Replace NULL_BYTE with empty string to avoid errors with + # the database client (should be O(n)) + str_value = ( + str_value.replace(NULL_BYTE, "") + if NULL_BYTE in str_value + else str_value + ) + return str_value except Exception as exc: - logger.debug("Failed to parse bytes valud as string: %s", exc) + logger.debug("Failed to parse bytes value as string: %s", exc) logger.debug(traceback.format_exc()) return value.hex() diff --git a/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/snowflake/sampler.py b/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/snowflake/sampler.py index a5c48c559b84..5757782b17d4 100644 --- a/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/snowflake/sampler.py +++ b/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/snowflake/sampler.py @@ -34,7 +34,6 @@ class SnowflakeSampler(SQASampler): run the query in the whole table. """ - # pylint: disable=too-many-arguments def __init__( self, client, diff --git a/ingestion/tests/unit/topology/dashboard/test_looker.py b/ingestion/tests/unit/topology/dashboard/test_looker.py index ea195f1c6ece..b5a2b2b0ab46 100644 --- a/ingestion/tests/unit/topology/dashboard/test_looker.py +++ b/ingestion/tests/unit/topology/dashboard/test_looker.py @@ -300,6 +300,53 @@ def test_clean_table_name(self): self.assertEqual(self.looker._clean_table_name("TABLE AS ALIAS"), "table") + def test_render_table_name(self): + """ + Check that table is rendered correctly if "openmetadata" or default condition apply, or no templating is present + """ + tagged_table_name_template = """ + {%- if openmetadata -%} + `BQ-project.dataset.sample_data` + {%- elsif prod -%} + `BQ-project.dataset.sample_data` + {%- elsif dev -%} + `BQ-project.{{_user_attributes['dbt_dev_schema']}}.sample_data` + {%- endif -%} + """ + default_table_name_template = """ + {%- if prod -%} + `BQ-project.dataset.sample_data` + {%- elsif dev -%} + `BQ-project.{{_user_attributes['dbt_dev_schema']}}.sample_data` + {%- else -%} + `BQ-project.dataset.sample_data` + {%- endif -%} + """ + untagged_table_name_template = """ + {%- if prod -%} + `BQ-project.dataset.sample_data` + {%- elsif dev -%} + `BQ-project.{{_user_attributes['dbt_dev_schema']}}.sample_data` + {%- endif -%} + """ + table_name_plain = "`BQ-project.dataset.sample_data`" + self.assertEqual( + self.looker._render_table_name(tagged_table_name_template), + "`BQ-project.dataset.sample_data`", + ) + self.assertEqual( + self.looker._render_table_name(default_table_name_template), + "`BQ-project.dataset.sample_data`", + ) + self.assertNotEqual( + self.looker._render_table_name(untagged_table_name_template), + "`BQ-project.dataset.sample_data`", + ) + self.assertEqual( + self.looker._render_table_name(table_name_plain), + "`BQ-project.dataset.sample_data`", + ) + def test_get_dashboard_sources(self): """ Check how we are building the sources