From 44c3cd32622203b21bc75168d58096d2e55f817a Mon Sep 17 00:00:00 2001
From: Alexis Weill
Date: Wed, 9 Oct 2024 14:01:19 -0700
Subject: [PATCH 1/7] Update incremental-microbatch.md

Update config in example

`session_start` -> `page_view_start`
---
 website/docs/docs/build/incremental-microbatch.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/docs/build/incremental-microbatch.md b/website/docs/docs/build/incremental-microbatch.md
index d200dd6e4b6..6561c44f547 100644
--- a/website/docs/docs/build/incremental-microbatch.md
+++ b/website/docs/docs/build/incremental-microbatch.md
@@ -51,7 +51,7 @@ We run the `sessions` model on October 1, 2024, and then again on October 2. It
 {{ config(
     materialized='incremental',
     incremental_strategy='microbatch',
-    event_time='session_start',
+    event_time='page_view_start',
     begin='2020-01-01',
     batch_size='day'
 ) }}

From a737eeb9d12e0efd171fc280d2115bbe605b4810 Mon Sep 17 00:00:00 2001
From: Petro Tiurin <93913847+ptiurin@users.noreply.github.com>
Date: Thu, 10 Oct 2024 11:57:00 +0100
Subject: [PATCH 2/7] fix: Correct agg index configuration for Firebolt

---
 .../resource-configs/firebolt-configs.md      | 24 ++++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/website/docs/reference/resource-configs/firebolt-configs.md b/website/docs/reference/resource-configs/firebolt-configs.md
index 394823e33de..0ab14354003 100644
--- a/website/docs/reference/resource-configs/firebolt-configs.md
+++ b/website/docs/reference/resource-configs/firebolt-configs.md
@@ -38,8 +38,8 @@ models:
     +table_type: fact
     +primary_index: [ <column-name>, ... ]
     +indexes:
-      - type: aggregating
-        key_column: [ <column-name>, ... ]
+      - index_type: aggregating
+        key_columns: [ <column-name>, ... ]
         aggregation: [ <agg-sql>, ... ]
     ...
@@ -58,8 +58,8 @@ models:
     table_type: fact
     primary_index: [ <column-name>, ... ]
     indexes:
-      - type: aggregating
-        key_column: [ <column-name>, ... ]
+      - index_type: aggregating
+        key_columns: [ <column-name>, ... ]
         aggregation: [ <agg-sql>, ... ]
     ...
@@ -77,9 +77,9 @@ models:
   primary_index = [ "<column-name>", ... ],
   indexes = [
     {
-      type = "aggregating"
-      key_column = [ "<column-name>", ... ],
-      aggregation = [ "<agg-sql>", ... ],
+      "index_type": "aggregating",
+      "key_columns": [ "<column-name>", ... ],
+      "aggregation": [ "<agg-sql>", ... ],
     },
     ...
   ]
@@ -99,8 +99,8 @@ models:
 | `table_type`    | Whether the materialized table will be a [fact or dimension](https://docs.firebolt.io/godocs/Overview/working-with-tables/working-with-tables.html#fact-and-dimension-tables) table. |
 | `primary_index` | Sets the primary index for the fact table using the inputted list of column names from the model. Required for fact tables. |
 | `indexes`       | A list of aggregating indexes to create on the fact table. |
-| `type`          | Specifies that the index is an [aggregating index](https://docs.firebolt.io/godocs/Guides/working-with-indexes/using-aggregating-indexes.html). Should be set to `aggregating`. |
-| `key_column`    | Sets the grouping of the aggregating index using the inputted list of column names from the model. |
+| `index_type`    | Specifies that the index is an [aggregating index](https://docs.firebolt.io/godocs/Guides/working-with-indexes/using-aggregating-indexes.html). Should be set to `aggregating`. |
+| `key_columns`   | Sets the grouping of the aggregating index using the inputted list of column names from the model. |
 | `aggregation`   | Sets the aggregations on the aggregating index using the inputted list of SQL agg expressions. |
@@ -113,9 +113,9 @@ models:
     primary_index = "id",
     indexes = [
       {
-        type: "aggregating",
-        key_column: "order_id",
-        aggregation: ["COUNT(DISTINCT status)", "AVG(customer_id)"]
+        "index_type": "aggregating",
+        "key_columns": "order_id",
+        "aggregation": ["COUNT(DISTINCT status)", "AVG(customer_id)"]
       }
     ]
 ) }}
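For readers applying patch 2 by hand, here is a minimal sketch of a complete model file that exercises the corrected `index_type` and `key_columns` keys. Only the config keys and the `order_id`/`status`/`customer_id` columns come from the patch; the select body, the `source()` call, and the `ecom.raw_orders` source name are illustrative assumptions.

```sql
-- Hypothetical Firebolt fact-table model using the corrected config keys.
-- Source and column names are assumed for illustration.
{{ config(
    materialized = "table",
    table_type = "fact",
    primary_index = "order_id",
    indexes = [
      {
        "index_type": "aggregating",
        "key_columns": "order_id",
        "aggregation": ["COUNT(DISTINCT status)", "AVG(customer_id)"]
      }
    ]
) }}

select
    order_id,
    customer_id,
    status
from {{ source('ecom', 'raw_orders') }}
```

The shape mirrors the `orders` example at the end of the patch, so the two can be compared directly.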
From d75e9d0653dd6c920553608fcb8993027100d67e Mon Sep 17 00:00:00 2001
From: Doug Beatty <44704949+dbeatty10@users.noreply.github.com>
Date: Thu, 10 Oct 2024 17:49:45 -0600
Subject: [PATCH 3/7] Referenced nodes in defer

---
 website/docs/reference/node-selection/defer.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/reference/node-selection/defer.md b/website/docs/reference/node-selection/defer.md
index 99dbea401b3..be55ab93d3d 100644
--- a/website/docs/reference/node-selection/defer.md
+++ b/website/docs/reference/node-selection/defer.md
@@ -31,7 +31,7 @@ dbt test --models [...] --defer --state path/to/artifacts
 When the `--defer` flag is provided, dbt will resolve `ref` calls differently depending on two criteria:
 
 1. Is the referenced node included in the model selection criteria of the current run?
-2. Does the reference node exist as a database object in the current environment?
+2. Does the referenced node exist as a database object in the current environment?
 
 If the answer to both is **no**—a node is not included _and_ it does not exist as a database object in the current environment—references to it will use the other namespace instead, provided by the state manifest.
 
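To make patch 3's two-criteria rule concrete, the following sketch traces how a single `ref` might compile under `--defer`. The schema names `dev_alice` and `prod_analytics` and the model names are assumptions for illustration; the resolution logic follows the defer docs quoted above.

```sql
-- models/downstream_model.sql
select * from {{ ref('upstream_model') }}

-- Without --defer, the ref compiles against the current (development) environment:
--   select * from dev_alice.upstream_model

-- With `dbt test --models downstream_model --defer --state path/to/artifacts`,
-- if upstream_model is (1) not selected in this run AND (2) not built as a
-- database object in dev_alice, the ref resolves from the other namespace
-- provided by the state manifest instead:
--   select * from prod_analytics.upstream_model
```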
From 4f0cccb4082e20b79cea826b52558e65859e60d8 Mon Sep 17 00:00:00 2001
From: Mirna Wong <89008547+mirnawong1@users.noreply.github.com>
Date: Fri, 11 Oct 2024 10:57:11 +0100
Subject: [PATCH 4/7] Update incremental-microbatch.md

add final select statement to show where `session_start` is coming from
---
 website/docs/docs/build/incremental-microbatch.md | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/website/docs/docs/build/incremental-microbatch.md b/website/docs/docs/build/incremental-microbatch.md
index 6561c44f547..30d907a41cf 100644
--- a/website/docs/docs/build/incremental-microbatch.md
+++ b/website/docs/docs/build/incremental-microbatch.md
@@ -51,7 +51,7 @@ We run the `sessions` model on October 1, 2024, and then again on October 2. It
 {{ config(
     materialized='incremental',
     incremental_strategy='microbatch',
-    event_time='page_view_start',
+    event_time='session_start',
     begin='2020-01-01',
     batch_size='day'
 ) }}
@@ -70,7 +70,13 @@ customers as (
 
 ),
 
-...
+select
+  page_views.id as session_id,
+  page_views.page_view_start as session_start,
+  customers.*
+from page_views
+left join customers
+  on page_views.customer_id = customers.id
 
 ```

From 87e662c78554acc58b304146273a9fdfb0233df2 Mon Sep 17 00:00:00 2001
From: Mirna Wong <89008547+mirnawong1@users.noreply.github.com>
Date: Fri, 11 Oct 2024 12:32:09 +0100
Subject: [PATCH 5/7] Update incremental-microbatch.md

clarify session_start
---
 website/docs/docs/build/incremental-microbatch.md | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/website/docs/docs/build/incremental-microbatch.md b/website/docs/docs/build/incremental-microbatch.md
index 30d907a41cf..2cc39e9e3b9 100644
--- a/website/docs/docs/build/incremental-microbatch.md
+++ b/website/docs/docs/build/incremental-microbatch.md
@@ -24,7 +24,7 @@ Each "batch" corresponds to a single bounded time period (by default, a single d
 
 ### Example
 
-A `sessions` model is aggregating and enriching data that comes from two other models:
+A `sessions` model aggregates and enriches data that comes from two other models.
 - `page_views` is a large, time-series table. It contains many rows, new records almost always arrive after existing ones, and existing records rarely update.
 - `customers` is a relatively small dimensional table. Customer attributes update often, and not in a time-based manner — that is, older customers are just as likely to change column values as newer customers.
@@ -39,12 +39,15 @@ models:
     event_time: page_view_start
 ```
 
+
 We run the `sessions` model on October 1, 2024, and then again on October 2. It produces the following queries:
 
+The `event_time` for the `sessions` model is set to `session_start`, which marks the beginning of a user’s session on the website. This setting allows dbt to combine multiple page views (each tracked by its own `page_view_start` timestamp) into a single session. This way, `session_start` differentiates the timing of individual page views from the broader timeframe of the entire user session.
+
 
 ```sql
@@ -147,7 +150,7 @@ customers as (
 
 dbt will instruct the data platform to take the result of each batch query and insert, update, or replace the contents of the `analytics.sessions` table for the same day of data. To perform this operation, dbt will use the most efficient atomic mechanism for "full batch" replacement that is available on each data platform.
 
-It does not matter whether the table already contains data for that day, or not. Given the same input data, no matter how many times a batch is reprocessed, the resulting table is the same.
+It does not matter whether the table already contains data for that day. Given the same input data, the resulting table is the same no matter how many times a batch is reprocessed.
 
@@ -181,11 +184,11 @@ During standard incremental runs, dbt will process batches according to the curr
 
-**Note:** If there’s an upstream model that configures `event_time`, but you *don’t* want the reference to it to be filtered, you can specify `ref('upstream_model').render()` to opt-out of auto-filtering. This isn't generally recommended — most models which configure `event_time` are fairly large, and if the reference is not filtered, each batch will perform a full scan of this input table.
+**Note:** If there’s an upstream model that configures `event_time`, but you *don’t* want the reference to it to be filtered, you can specify `ref('upstream_model').render()` to opt out of auto-filtering. This isn't generally recommended — most models that configure `event_time` are fairly large, and if the reference is not filtered, each batch will perform a full scan of this input table.
 
 ### Backfills
 
-Whether to fix erroneous source data, or retroactively apply a change in business logic, you may need to reprocess a large amount of historical data.
+Whether to fix erroneous source data or retroactively apply a change in business logic, you may need to reprocess a large amount of historical data.
 
 Backfilling a microbatch model is as simple as selecting it to run or build, and specifying a "start" and "end" for `event_time`. As always, dbt will process the batches between the start and end as independent queries.
 
@@ -210,7 +213,7 @@ For now, dbt assumes that all values supplied are in UTC:
 - `event_time`
 - `--event-time-start`
 - `--event-time-end`
 
-While we may consider adding support for custom timezones in the future, we also believe that defining these values in UTC makes everyone's lives easier.
+While we may consider adding support for custom time zones in the future, we also believe that defining these values in UTC makes everyone's lives easier.
 
 ## How `microbatch` compares to other incremental strategies?
@@ -267,7 +270,7 @@ select * from {{ ref('stg_events') }}   -- this ref will be auto-filtered
 ```
 
-Where you’ve also set an `event_time` for the model’s direct parents - in this case `stg_events`:
+Where you’ve also set an `event_time` for the model’s direct parents - in this case, `stg_events`:
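As a rough picture of the per-batch queries patches 4 and 5 describe, this is approximately what the October 1 batch of `sessions` looks like once dbt auto-filters the `page_views` ref on its configured `event_time` column. The exact SQL dbt generates varies by adapter, and the `analytics` schema is an assumption; treat this as a sketch, not the literal generated query.

```sql
-- Sketch of one day's batch (2024-10-01) for the sessions model.
with page_views as (
    -- ref('page_views') is auto-filtered to the batch bounds because
    -- page_views configures event_time: page_view_start
    select * from analytics.page_views
    where page_view_start >= '2024-10-01'
      and page_view_start < '2024-10-02'
),

customers as (
    -- customers configures no event_time, so it is read in full
    select * from analytics.customers
)

select
    page_views.id as session_id,
    page_views.page_view_start as session_start,
    customers.*
from page_views
left join customers
    on page_views.customer_id = customers.id
```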
From b6b21c17e5109d0d291348d601363ed1d6e6bccd Mon Sep 17 00:00:00 2001
From: Mirna Wong <89008547+mirnawong1@users.noreply.github.com>
Date: Fri, 11 Oct 2024 15:02:51 +0100
Subject: [PATCH 6/7] Update model-contracts.md

remove 'in the future' as dbt mesh is now enabled
---
 website/docs/docs/collaborate/govern/model-contracts.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/website/docs/docs/collaborate/govern/model-contracts.md b/website/docs/docs/collaborate/govern/model-contracts.md
index b07ce909480..d30024157c8 100644
--- a/website/docs/docs/collaborate/govern/model-contracts.md
+++ b/website/docs/docs/collaborate/govern/model-contracts.md
@@ -178,14 +178,14 @@ Currently, `not_null` and `check` constraints are enforced only after a model is
 
 ### Which models should have contracts?
 
 Any model meeting the criteria described above _can_ define a contract. We recommend defining contracts for ["public" models](model-access) that are being relied on downstream.
-- Inside of dbt: Shared with other groups, other teams, and (in the future) other dbt projects.
+- Inside of dbt: Shared with other groups, other teams, and [other dbt projects](/best-practices/how-we-mesh/mesh-1-intro).
 - Outside of dbt: Reports, dashboards, or other systems & processes that expect this model to have a predictable structure. You might reflect these downstream uses with [exposures](/docs/build/exposures).
 
 ### How are contracts different from tests?
 
 A model's contract defines the **shape** of the returned dataset. If the model's logic or input data doesn't conform to that shape, the model does not build.
 
-[Data Tests](/docs/build/data-tests) are a more flexible mechanism for validating the content of your model _after_ it's built. So long as you can write the query, you can run the data test. Data tests are more configurable, such as with [custom severity thresholds](/reference/resource-configs/severity). They are easier to debug after finding failures, because you can query the already-built model, or [store the failing records in the data warehouse](/reference/resource-configs/store_failures).
+[Data Tests](/docs/build/data-tests) are a more flexible mechanism for validating the content of your model _after_ it's built. So long as you can write the query, you can run the data test. Data tests are more configurable, such as with [custom severity thresholds](/reference/resource-configs/severity). They are easier to debug after finding failures because you can query the already-built model, or [store the failing records in the data warehouse](/reference/resource-configs/store_failures).
 
 In some cases, you can replace a data test with its equivalent constraint. This has the advantage of guaranteeing the validation at build time, and it probably requires less compute (cost) in your data platform. The prerequisites for replacing a data test with a constraint are:
 - Making sure that your data platform can support and enforce the constraint that you need. Most platforms only enforce `not_null`.
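To ground patch 6's test-versus-constraint trade-off, here is a minimal sketch of a contracted public model in which a `not_null` data test is replaced by a build-time constraint. The `dim_customers` and `customer_id` names are assumptions for illustration.

```yaml
# Hypothetical models/_models.yml entry: the contract is validated when the
# model builds, before bad data can land, instead of by a data test afterwards.
models:
  - name: dim_customers
    access: public          # contracts are most useful on shared, public models
    config:
      contract:
        enforced: true
    columns:
      - name: customer_id
        data_type: int
        constraints:
          - type: not_null  # replaces an equivalent not_null data test
```

As the patch notes, this trade only works when the platform can actually enforce the constraint; on most platforms that limits it to `not_null`.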
From 8d36029092815f832a3a400d52f3c7e721ae6ec8 Mon Sep 17 00:00:00 2001
From: Matt Shaver <60105315+matthewshaver@users.noreply.github.com>
Date: Fri, 11 Oct 2024 17:00:51 -0400
Subject: [PATCH 7/7] Coalesce 2024 announcements (#6282)

## What are you changing in this pull request and why?

Documentation links for 2024 Coalesce announcements

Expandable does not use a bullet point so it stands out

Using the https links to help out with a new window instead of a lot of
the back button and page reloads

## Checklist

- [ ] I have reviewed the [Content style guide](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/content-style-guide.md) so my content adheres to these guidelines.
- [ ] The topic I'm writing about is for specific dbt version(s) and I have versioned it according to the [version a whole page](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/single-sourcing-content.md#adding-a-new-version) and/or [version a block of content](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/single-sourcing-content.md#versioning-blocks-of-content) guidelines.
- [ ] I have added checklist item(s) to this list for anything that needs to happen before this PR is merged, such as "needs technical review" or "change base branch."
---------

Co-authored-by: Ly Nguyen <107218380+nghi-ly@users.noreply.github.com>
---
 .../docs/docs/dbt-versions/release-notes.md   | 25 +++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/website/docs/docs/dbt-versions/release-notes.md b/website/docs/docs/dbt-versions/release-notes.md
index 456d1bf0e82..662fd0f381a 100644
--- a/website/docs/docs/dbt-versions/release-notes.md
+++ b/website/docs/docs/dbt-versions/release-notes.md
@@ -20,6 +20,31 @@ Release notes are grouped by month for both multi-tenant and virtual private clo
 
 ## October 2024
 
+<Expandable alt_header="Coalesce 2024 announcements">
+
+  Documentation for new features and functionality announced at Coalesce 2024:
+
+  - Iceberg table support for [Snowflake](https://docs.getdbt.com/reference/resource-configs/snowflake-configs#iceberg-table-format)
+  - [Athena](https://docs.getdbt.com/reference/resource-configs/athena-configs) and [Teradata](https://docs.getdbt.com/reference/resource-configs/teradata-configs) adapter support in dbt Cloud
+  - dbt Cloud now hosted on [Azure](https://docs.getdbt.com/docs/cloud/about-cloud/access-regions-ip-addresses)
+  - Get comfortable with [Versionless dbt Cloud](https://docs.getdbt.com/docs/dbt-versions/versionless-cloud)
+  - Scalable [microbatch incremental models](https://docs.getdbt.com/docs/build/incremental-microbatch)
+  - Advanced CI [features](https://docs.getdbt.com/docs/deploy/advanced-ci)
+  - [Linting with CI jobs](https://docs.getdbt.com/docs/deploy/continuous-integration#sql-linting)
+  - dbt Assist is now [dbt Copilot](https://docs.getdbt.com/docs/cloud/dbt-copilot)
+  - Developer blog on [Snowflake Feature Store and dbt: A bridge between data pipelines and ML](https://docs.getdbt.com/blog/snowflake-feature-store)
+  - New [Quickstart for dbt Cloud CLI](https://docs.getdbt.com/guides/dbt-cloud-cli?step=1)
+  - [Auto-exposures with Tableau](https://docs.getdbt.com/docs/collaborate/auto-exposures)
+  - Semantic Layer integration with [Excel desktop and M365](https://docs.getdbt.com/docs/cloud-integrations/semantic-layer/excel)
+  - [Data health tiles](https://docs.getdbt.com/docs/collaborate/data-tile)
+  - [Semantic Layer and Cloud IDE integration](https://docs.getdbt.com/docs/build/metricflow-commands#metricflow-commands)
+  - Query history in [Explorer](https://docs.getdbt.com/docs/collaborate/model-query-history#view-query-history-in-explorer)
+  - Semantic Layer MetricFlow improvements, including [improved granularity and custom calendar](https://docs.getdbt.com/docs/build/metricflow-time-spine#custom-calendar)
+  - [Python SDK](https://docs.getdbt.com/docs/dbt-cloud-apis/sl-python) is now generally available
+
+</Expandable>
+
 - **New**: The [dbt Semantic Layer Python software development kit](/docs/dbt-cloud-apis/sl-python) is now [generally available](/docs/dbt-versions/product-lifecycles). It provides users with easy access to the dbt Semantic Layer with Python and enables developers to interact with the dbt Semantic Layer APIs to query metrics/dimensions in downstream tools.
 - **Enhancement**: You can now add a description to a singular data test in dbt Cloud Versionless. Use the [`description` property](/reference/resource-properties/description) to document [singular data tests](/docs/build/data-tests#singular-data-tests). You can also use [docs block](/docs/build/documentation#using-docs-blocks) to capture your test description. The enhancement will be included in the upcoming dbt Core 1.9 release.
- **New**: Introducing the [microbatch incremental model strategy](/docs/build/incremental-microbatch) (beta), which is available in dbt Cloud Versionless and will soon be supported in dbt Core 1.9. The microbatch strategy allows for efficient, batch-based processing of large time-series datasets for improved performance and resiliency, especially when you're working with data that changes over time (like new records being added daily). To enable this feature in dbt Cloud, set the `DBT_EXPERIMENTAL_MICROBATCH` environment variable to `true` in your project.
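Putting the final release note together, here is a minimal sketch of opting into the microbatch beta in dbt Cloud: set the `DBT_EXPERIMENTAL_MICROBATCH` environment variable to `true` (as the note says), then configure the model. The `occurred_at` column and the `stg_events` body are assumptions for illustration.

```sql
-- Hypothetical microbatch model, usable once DBT_EXPERIMENTAL_MICROBATCH=true
-- is set in the project's environment variables.
{{ config(
    materialized='incremental',
    incremental_strategy='microbatch',
    event_time='occurred_at',   -- assumed timestamp column on this model
    begin='2024-01-01',         -- earliest date dbt will ever backfill
    batch_size='day'            -- one independent query per day of data
) }}

select * from {{ ref('stg_events') }}  -- auto-filtered per batch if stg_events sets event_time
```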