From 523a11dd3a97f8b27b52be51f2c845204f612e9b Mon Sep 17 00:00:00 2001 From: Anna Scholtz Date: Tue, 14 May 2024 13:06:50 -0700 Subject: [PATCH] Update metric hub docs with joins and wildcard information --- src/concepts/metric_hub.md | 61 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 2 deletions(-) diff --git a/src/concepts/metric_hub.md b/src/concepts/metric_hub.md index 87de9712e..c03db28bc 100644 --- a/src/concepts/metric_hub.md +++ b/src/concepts/metric_hub.md @@ -82,6 +82,27 @@ from_expression = """ submission_date_column = "submission_date" ``` +Data sources can be joined with other data sources: + +```toml +# Join the `baseline` data source with the `metrics` data source. +# Definitions for both data sources must exist. +[data_sources.baseline.joins.metrics] +relationship = "many_to_many" # this determines the type of JOIN used; options: many_to_many, one_to_one, one_to_many, many_to_one +on_expression = """ # SQL expression specifying the JOIN condition + baseline.client_id = metrics.client_id AND + baseline.submission_date = metrics.submission_date +""" +``` + +Wildcard characters can be used to apply joins to multiple data sources: + +```toml +# Apply join to all data sources prefixed with user_ +[data_sources.user_'*'.joins.metrics] +# defaults are many_to_many relationship and joins on the client_id_column and submission_date_column +``` + ### `[metrics]` Section The metrics sections allows to specify metrics. A metric aggregates data and is associated with some data source. 
@@ -145,6 +166,16 @@ client_count = {} mean = {} ``` +Wildcard expressions can be used to specify that a particular statistic should be available for multiple metrics: + +```toml +# All metrics with the bookmark_ prefix should have the mean computed +[metrics.bookmark_'*'.statistics.mean] + +# All metrics should have client counts computed (applying a statistic to every metric is not recommended) +[metrics.'*'.statistics.client_count] +``` + New statistics need to be implemented inside the tooling that uses metric definitions. ### `[dimensions]` Section @@ -293,7 +324,7 @@ These explores look like the following: The side pane is split into different sections: -- **Base Fields**: This section contains dimensions that are useful for filtering or segmenting the population, like channel or operating system. These base fields are based on `clients_daily` tables. +- **Base Fields**: This section contains dimensions that are useful for filtering or segmenting the population, like channel or operating system. These base fields can be configured in metric-hub (see below). - **Metrics**: This section contains all metrics that are based on the data source represented by the explore. These metrics describe an aggregation of activities or measurements on a per-client basis. - **Statistics**: This sections contains the [statistics that have been defined in metric-hub on top of the metric definitions](https://github.com/mozilla/metric-hub/tree/main/looker) as measures. These statistics summarize the distribution of metrics within a specific time frame, population and/or segment and are used to derive insights and patterns from the raw metric data. Statistics have to be defined manually under the [`looker/` directory in metric-hub](https://github.com/mozilla/metric-hub/tree/main/looker). - **Sample of source data**: Defines the sample size that should be selected from the data source. 
Decreasing the sample size will speed up getting results in Looker, however it might decrease the accuracy. The results are being adjusted based on the sample size. For example, if a 1% sample is being used, then certain statistic results (like sum, count) will be multiplied by 100. @@ -301,7 +332,7 @@ The side pane is split into different sections: #### Getting Metrics into Looker -Metric definitions will be available in the "Metric Definition" explores for metrics that have been added to the [`defintions/` folder in metric-hub](https://github.com/mozilla/metric-hub/tree/main/definitions). +Metric definitions will be available in the "Metric Definition" explores for metrics that have been added to the [`definitions/` folder in metric-hub](https://github.com/mozilla/metric-hub/tree/main/definitions). Statistics on top of these metrics need to be defined in the [`looker/` folder in metric-hub](https://github.com/mozilla/metric-hub/tree/main/looker). Statistics currently supported by Looker are: @@ -316,6 +347,32 @@ Statistics on top of these metrics need to be defined in the [`looker/` folder i To get more statistics added, please reach out on the [#data-help](https://mozilla.slack.com/archives/C4D5ZA91B) Slack channel. +To filter and segment metrics in Looker, data sources that expose fields as dimensions can be configured in metric-hub. These base field data sources need to be joined with the metric data sources. 
Wildcard characters can be used to apply these joins to multiple data sources: + +```toml +[data_sources.looker_base_fields] +select_expression = """ + SELECT + submission_date, + client_id, + os, + country, + channel + FROM + mozdata.telemetry.clients_daily +""" +columns_as_dimensions = true # expose the selected fields as dimensions in Looker + +# Join `looker_base_fields` on to all the data sources in the file +# The selected fields in `looker_base_fields` will show up as dimensions for all the metrics +[data_sources.'*'.joins.looker_base_fields] + +# Override the join to allow a different data source to be used as the base field data source +[data_sources.baseline.joins.some_other_datasource] +relationship = "many_to_many" +on_expression = "baseline.client_id = some_other_datasource.client_id" +``` + #### Example Use Cases Some stakeholders would like to analyze crash metrics for Firefox Desktop in Looker. First, relevant metrics, such as number of socket crashes, need to be [added to `definitions/firefox_desktop.toml`](https://github.com/mozilla/metric-hub/blob/4ef7e2ef8a53c90f77a692af4c82ef31be8bf369/definitions/firefox_desktop.toml#L1577C10-L1593C11):